summaryrefslogtreecommitdiff
path: root/include/linux/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/ceph')
-rw-r--r--include/linux/ceph/auth.h120
-rw-r--r--include/linux/ceph/buffer.h8
-rw-r--r--include/linux/ceph/ceph_debug.h49
-rw-r--r--include/linux/ceph/ceph_features.h247
-rw-r--r--include/linux/ceph/ceph_frag.h42
-rw-r--r--include/linux/ceph/ceph_fs.h294
-rw-r--r--include/linux/ceph/ceph_hash.h1
-rw-r--r--include/linux/ceph/cls_lock_client.h58
-rw-r--r--include/linux/ceph/debugfs.h25
-rw-r--r--include/linux/ceph/decode.h176
-rw-r--r--include/linux/ceph/libceph.h238
-rw-r--r--include/linux/ceph/mdsmap.h63
-rw-r--r--include/linux/ceph/messenger.h516
-rw-r--r--include/linux/ceph/mon_client.h81
-rw-r--r--include/linux/ceph/msgpool.h13
-rw-r--r--include/linux/ceph/msgr.h84
-rw-r--r--include/linux/ceph/osd_client.h583
-rw-r--r--include/linux/ceph/osdmap.h288
-rw-r--r--include/linux/ceph/pagelist.h31
-rw-r--r--include/linux/ceph/rados.h343
-rw-r--r--include/linux/ceph/string_table.h63
-rw-r--r--include/linux/ceph/striper.h71
-rw-r--r--include/linux/ceph/types.h2
23 files changed, 2556 insertions, 840 deletions
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 5f3386844134..6b138fa97db8 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_AUTH_H
#define _FS_CEPH_AUTH_H
@@ -12,7 +13,11 @@
*/
struct ceph_auth_client;
-struct ceph_authorizer;
+struct ceph_msg;
+
+struct ceph_authorizer {
+ void (*destroy)(struct ceph_authorizer *);
+};
struct ceph_auth_handshake {
struct ceph_authorizer *authorizer;
@@ -20,11 +25,13 @@ struct ceph_auth_handshake {
size_t authorizer_buf_len;
void *authorizer_reply_buf;
size_t authorizer_reply_buf_len;
+ int (*sign_message)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
+ int (*check_message_signature)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
};
struct ceph_auth_client_ops {
- const char *name;
-
/*
* true if we are authenticated and can connect to
* services.
@@ -43,8 +50,10 @@ struct ceph_auth_client_ops {
* another request.
*/
int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
- int (*handle_reply)(struct ceph_auth_client *ac, int result,
- void *buf, void *end);
+ int (*handle_reply)(struct ceph_auth_client *ac, u64 global_id,
+ void *buf, void *end, u8 *session_key,
+ int *session_key_len, u8 *con_secret,
+ int *con_secret_len);
/*
* Create authorizer for connecting to a service, and verify
@@ -55,10 +64,15 @@ struct ceph_auth_client_ops {
/* ensure that an existing authorizer is up to date */
int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type,
struct ceph_auth_handshake *auth);
+ int (*add_authorizer_challenge)(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *challenge_buf,
+ int challenge_buf_len);
int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
- struct ceph_authorizer *a, size_t len);
- void (*destroy_authorizer)(struct ceph_auth_client *ac,
- struct ceph_authorizer *a);
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
void (*invalidate_authorizer)(struct ceph_auth_client *ac,
int peer_type);
@@ -66,6 +80,11 @@ struct ceph_auth_client_ops {
void (*reset)(struct ceph_auth_client *ac);
void (*destroy)(struct ceph_auth_client *ac);
+
+ int (*sign_message)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
+ int (*check_message_signature)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
};
struct ceph_auth_client {
@@ -79,11 +98,17 @@ struct ceph_auth_client {
const struct ceph_crypto_key *key; /* our secret key */
unsigned want_keys; /* which services we want */
+ int preferred_mode; /* CEPH_CON_MODE_* */
+ int fallback_mode; /* ditto */
+
struct mutex mutex;
};
-extern struct ceph_auth_client *ceph_auth_init(const char *name,
- const struct ceph_crypto_key *key);
+void ceph_auth_set_global_id(struct ceph_auth_client *ac, u64 global_id);
+
+struct ceph_auth_client *ceph_auth_init(const char *name,
+ const struct ceph_crypto_key *key,
+ const int *con_modes);
extern void ceph_auth_destroy(struct ceph_auth_client *ac);
extern void ceph_auth_reset(struct ceph_auth_client *ac);
@@ -93,24 +118,73 @@ extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
void *buf, size_t len,
void *reply_buf, size_t reply_len);
-extern int ceph_entity_name_encode(const char *name, void **p, void *end);
+int ceph_auth_entity_name_encode(const char *name, void **p, void *end);
extern int ceph_build_auth(struct ceph_auth_client *ac,
void *msg_buf, size_t msg_len);
-
extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
-extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
- int peer_type,
- struct ceph_auth_handshake *auth);
-extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
- struct ceph_authorizer *a);
-extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
- int peer_type,
- struct ceph_auth_handshake *a);
-extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a,
- size_t len);
+
+int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, bool force_new,
+ int *proto, int *pref_mode, int *fallb_mode);
+void ceph_auth_destroy_authorizer(struct ceph_authorizer *a);
+int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *challenge_buf,
+ int challenge_buf_len);
+int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac,
int peer_type);
+static inline int ceph_auth_sign_message(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg)
+{
+ if (auth->sign_message)
+ return auth->sign_message(auth, msg);
+ return 0;
+}
+
+static inline
+int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg)
+{
+ if (auth->check_message_signature)
+ return auth->check_message_signature(auth, msg);
+ return 0;
+}
+
+int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len);
+int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
+ int reply_len, void *buf, int buf_len);
+int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt);
+
+int ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, void *buf, int *buf_len);
+int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ void *buf, int *buf_len);
+int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac,
+ int peer_type, int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt);
+
#endif
diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h
index 58d19014068f..11cdc7c60480 100644
--- a/include/linux/ceph/buffer.h
+++ b/include/linux/ceph/buffer.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FS_CEPH_BUFFER_H
#define __FS_CEPH_BUFFER_H
@@ -10,14 +11,12 @@
/*
* a simple reference counted buffer.
*
- * use kmalloc for small sizes (<= one page), vmalloc for larger
- * sizes.
+ * use kmalloc for smaller sizes, vmalloc for larger sizes.
*/
struct ceph_buffer {
struct kref kref;
struct kvec vec;
size_t alloc_len;
- bool is_vmalloc;
};
extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
@@ -31,7 +30,8 @@ static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
static inline void ceph_buffer_put(struct ceph_buffer *b)
{
- kref_put(&b->kref, ceph_buffer_release);
+ if (b)
+ kref_put(&b->kref, ceph_buffer_release);
}
extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h
index aa2e19182d99..5f904591fa5f 100644
--- a/include/linux/ceph/ceph_debug.h
+++ b/include/linux/ceph/ceph_debug.h
@@ -1,8 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_DEBUG_H
#define _FS_CEPH_DEBUG_H
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/string.h>
+
#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
/*
@@ -12,18 +15,25 @@
*/
# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
-extern const char *ceph_file_part(const char *s, int len);
# define dout(fmt, ...) \
pr_debug("%.*s %12.12s:%-4d : " fmt, \
8 - (int)sizeof(KBUILD_MODNAME), " ", \
- ceph_file_part(__FILE__, sizeof(__FILE__)), \
- __LINE__, ##__VA_ARGS__)
+ kbasename(__FILE__), __LINE__, ##__VA_ARGS__)
+# define doutc(client, fmt, ...) \
+ pr_debug("%.*s %12.12s:%-4d : [%pU %llu] " fmt, \
+ 8 - (int)sizeof(KBUILD_MODNAME), " ", \
+ kbasename(__FILE__), __LINE__, \
+ &client->fsid, client->monc.auth->global_id, \
+ ##__VA_ARGS__)
# else
/* faux printk call just to see any compiler warnings. */
-# define dout(fmt, ...) do { \
- if (0) \
- printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
- } while (0)
+# define dout(fmt, ...) \
+ no_printk(KERN_DEBUG fmt, ##__VA_ARGS__)
+# define doutc(client, fmt, ...) \
+ no_printk(KERN_DEBUG "[%pU %llu] " fmt, \
+ &client->fsid, \
+ client->monc.auth->global_id, \
+ ##__VA_ARGS__)
# endif
#else
@@ -32,7 +42,32 @@ extern const char *ceph_file_part(const char *s, int len);
* or, just wrap pr_debug
*/
# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
+# define doutc(client, fmt, ...) \
+ pr_debug(" [%pU %llu] %s: " fmt, &client->fsid, \
+ client->monc.auth->global_id, __func__, ##__VA_ARGS__)
#endif
+#define pr_notice_client(client, fmt, ...) \
+ pr_notice("[%pU %llu]: " fmt, &client->fsid, \
+ client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_info_client(client, fmt, ...) \
+ pr_info("[%pU %llu]: " fmt, &client->fsid, \
+ client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_warn_client(client, fmt, ...) \
+ pr_warn("[%pU %llu]: " fmt, &client->fsid, \
+ client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_warn_once_client(client, fmt, ...) \
+ pr_warn_once("[%pU %llu]: " fmt, &client->fsid, \
+ client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_err_client(client, fmt, ...) \
+ pr_err("[%pU %llu]: " fmt, &client->fsid, \
+ client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_warn_ratelimited_client(client, fmt, ...) \
+ pr_warn_ratelimited("[%pU %llu]: " fmt, &client->fsid, \
+ client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_err_ratelimited_client(client, fmt, ...) \
+ pr_err_ratelimited("[%pU %llu]: " fmt, &client->fsid, \
+ client->monc.auth->global_id, ##__VA_ARGS__)
+
#endif
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 4c42080347af..3a47acd9cc14 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -1,59 +1,224 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CEPH_FEATURES
#define __CEPH_FEATURES
/*
- * feature bits
+ * Each time we reclaim bits for reuse we need to specify another bit
+ * that, if present, indicates we have the new incarnation of that
+ * feature. Base case is 1 (first use).
*/
-#define CEPH_FEATURE_UID (1<<0)
-#define CEPH_FEATURE_NOSRCADDR (1<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
-#define CEPH_FEATURE_FLOCK (1<<3)
-#define CEPH_FEATURE_SUBSCRIBE2 (1<<4)
-#define CEPH_FEATURE_MONNAMES (1<<5)
-#define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
-#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
-#define CEPH_FEATURE_OBJECTLOCATOR (1<<8)
-#define CEPH_FEATURE_PGID64 (1<<9)
-#define CEPH_FEATURE_INCSUBOSDMAP (1<<10)
-#define CEPH_FEATURE_PGPOOL3 (1<<11)
-#define CEPH_FEATURE_OSDREPLYMUX (1<<12)
-#define CEPH_FEATURE_OSDENC (1<<13)
-#define CEPH_FEATURE_OMAP (1<<14)
-#define CEPH_FEATURE_MONENC (1<<15)
-#define CEPH_FEATURE_QUERY_T (1<<16)
-#define CEPH_FEATURE_INDEP_PG_MAP (1<<17)
-#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
-#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19)
-#define CEPH_FEATURE_MON_NULLROUTE (1<<20)
-#define CEPH_FEATURE_MON_GV (1<<21)
-#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22)
-#define CEPH_FEATURE_MSG_AUTH (1<<23)
-#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24)
-#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
-#define CEPH_FEATURE_CREATEPOOLID (1<<26)
-#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
-#define CEPH_FEATURE_OSD_HBMSGS (1<<28)
-#define CEPH_FEATURE_MDSENC (1<<29)
-#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
+#define CEPH_FEATURE_INCARNATION_1 (0ull)
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC
+
+#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \
+ static const uint64_t __maybe_unused CEPH_FEATURE_##name = (1ULL<<bit); \
+ static const uint64_t __maybe_unused CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+/* this bit is ignored but still advertised by release *when* */
+#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
+ static const uint64_t __maybe_unused DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
+ static const uint64_t __maybe_unused DEPRECATED_CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+/*
+ * this bit is ignored by release *unused* and not advertised by
+ * release *unadvertised*
+ */
+#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
+
+
+/*
+ * test for a feature. this test is safer than a typical mask against
+ * the bit because it ensures that we have the bit AND the marker for the
+ * bit's incarnation. this must be used in any case where the features
+ * bits may include an old meaning of the bit.
+ */
+#define CEPH_HAVE_FEATURE(x, name) \
+ (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
+
+
+/*
+ * Notes on deprecation:
+ *
+ * A *major* release is a release through which all upgrades must pass
+ * (e.g., jewel). For example, no pre-jewel server will ever talk to
+ * a post-jewel server (mon, osd, etc).
+ *
+ * For feature bits used *only* on the server-side:
+ *
+ * - In the first phase we indicate that a feature is DEPRECATED as of
+ * a particular release. This is the first major release X (say,
+ * jewel) that does not depend on its peers advertising the feature.
+ * That is, it safely assumes its peers all have the feature. We
+ * indicate this with the DEPRECATED macro. For example,
+ *
+ * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL)
+ *
+ * because 10.2.z (jewel) did not care if its peers advertised this
+ * feature bit.
+ *
+ * - In the second phase we stop advertising the bit and call it
+ * RETIRED. This can normally be done in the *next* major release
+ * following the one in which we marked the feature DEPRECATED. In
+ * the above example, for 12.0.z (luminous) we can say:
+ *
+ * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+ *
+ * - The bit can be reused in the first post-luminous release, 13.0.z
+ * (m).
+ *
+ * This ensures that no two versions who have different meanings for
+ * the bit ever speak to each other.
+ */
+
+DEFINE_CEPH_FEATURE( 0, 1, UID)
+DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)
+DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS)
+DEFINE_CEPH_FEATURE( 3, 1, FLOCK)
+DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)
+DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
+DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ)
+DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH)
+DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
+DEFINE_CEPH_FEATURE( 9, 1, PGID64)
+DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
+DEFINE_CEPH_FEATURE(11, 1, PGPOOL3)
+DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
+DEFINE_CEPH_FEATURE(13, 1, OSDENC)
+DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
+DEFINE_CEPH_FEATURE(15, 1, MONENC)
+DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES)
+DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS)
+DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap
+DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap
+DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap
+DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH)
+DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUNINOUS)
+
+DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)
+DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
+DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE)
+DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC)
+DEFINE_CEPH_FEATURE(29, 1, MDSENC)
+DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)
+DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS) // deprecate me
+DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL)
+DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2)
+DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER)
+DEFINE_CEPH_FEATURE(38, 1, OSD_ERASURE_CODES)
+DEFINE_CEPH_FEATURE(38, 1, OSD_OSD_TMAP2OMAP) // overlap
+DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC)
+DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA)
+DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3)
+DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap
+DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2)
+DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND)
+DEFINE_CEPH_FEATURE(44, 1, ERASURE_CODE_PLUGINS_V2)
+DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA)
+DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(50, 1, MON_METADATA)
+DEFINE_CEPH_FEATURE(51, 1, OSD_BITWISE_HOBJ_SORT)
+DEFINE_CEPH_FEATURE(52, 1, OSD_PROXY_WRITE_FEATURES)
+DEFINE_CEPH_FEATURE(53, 1, ERASURE_CODE_PLUGINS_V3)
+DEFINE_CEPH_FEATURE(54, 1, OSD_HITSET_GMT)
+DEFINE_CEPH_FEATURE(55, 1, HAMMER_0_94_4)
+DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING)
+DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB)
+DEFINE_CEPH_FEATURE(57, 1, MON_ROUTE_OSDMAP) // overlap
+DEFINE_CEPH_FEATURE(57, 1, OSDSUBOP_NO_SNAPCONTEXT) // overlap
+DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap
+DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5)
+DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap
+DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
+DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
+DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
+DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
+DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit*
+DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // *do not share this bit*
+
+DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal
+DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
+
/*
* Features supported.
*/
-#define CEPH_FEATURES_SUPPORTED_DEFAULT \
+#define CEPH_FEATURES_SUPPORTED_DEFAULT \
(CEPH_FEATURE_NOSRCADDR | \
+ CEPH_FEATURE_SERVER_NAUTILUS | \
+ CEPH_FEATURE_FLOCK | \
+ CEPH_FEATURE_SUBSCRIBE2 | \
+ CEPH_FEATURE_MONNAMES | \
CEPH_FEATURE_RECONNECT_SEQ | \
+ CEPH_FEATURE_DIRLAYOUTHASH | \
CEPH_FEATURE_PGID64 | \
CEPH_FEATURE_PGPOOL3 | \
CEPH_FEATURE_OSDENC | \
+ CEPH_FEATURE_MONENC | \
CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_SERVER_LUMINOUS | \
+ CEPH_FEATURE_RESEND_ON_SPLIT | \
+ CEPH_FEATURE_RADOS_BACKOFF | \
+ CEPH_FEATURE_OSDMAP_PG_UPMAP | \
+ CEPH_FEATURE_CRUSH_CHOOSE_ARGS | \
+ CEPH_FEATURE_MSG_AUTH | \
CEPH_FEATURE_CRUSH_TUNABLES2 | \
CEPH_FEATURE_REPLY_CREATE_INODE | \
- CEPH_FEATURE_OSDHASHPSPOOL)
-
-#define CEPH_FEATURES_REQUIRED_DEFAULT \
- (CEPH_FEATURE_NOSRCADDR | \
- CEPH_FEATURE_RECONNECT_SEQ | \
- CEPH_FEATURE_PGID64 | \
- CEPH_FEATURE_PGPOOL3 | \
- CEPH_FEATURE_OSDENC)
+ CEPH_FEATURE_SERVER_MIMIC | \
+ CEPH_FEATURE_MDSENC | \
+ CEPH_FEATURE_OSDHASHPSPOOL | \
+ CEPH_FEATURE_OSD_CACHEPOOL | \
+ CEPH_FEATURE_CRUSH_V2 | \
+ CEPH_FEATURE_EXPORT_PEER | \
+ CEPH_FEATURE_OSDMAP_ENC | \
+ CEPH_FEATURE_MDS_INLINE_DATA | \
+ CEPH_FEATURE_CRUSH_TUNABLES3 | \
+ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
+ CEPH_FEATURE_MSGR_KEEPALIVE2 | \
+ CEPH_FEATURE_OSD_POOLRESEND | \
+ CEPH_FEATURE_MDS_QUOTA | \
+ CEPH_FEATURE_CRUSH_V4 | \
+ CEPH_FEATURE_NEW_OSDOP_ENCODING | \
+ CEPH_FEATURE_SERVER_JEWEL | \
+ CEPH_FEATURE_MON_STATEFUL_SUB | \
+ CEPH_FEATURE_CRUSH_TUNABLES5 | \
+ CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \
+ CEPH_FEATURE_MSG_ADDR2 | \
+ CEPH_FEATURE_CEPHX_V2)
+
+#define CEPH_FEATURES_REQUIRED_DEFAULT 0
+
#endif
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
index 5babb8e95352..97bab0adc58a 100644
--- a/include/linux/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef FS_CEPH_FRAG_H
#define FS_CEPH_FRAG_H
@@ -40,57 +41,22 @@ static inline __u32 ceph_frag_mask_shift(__u32 f)
return 24 - ceph_frag_bits(f);
}
-static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+static inline bool ceph_frag_contains_value(__u32 f, __u32 v)
{
return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
}
-static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
-{
- /* is sub as specific as us, and contained by us? */
- return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
- (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-static inline __u32 ceph_frag_parent(__u32 f)
-{
- return ceph_frag_make(ceph_frag_bits(f) - 1,
- ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
-}
-static inline int ceph_frag_is_left_child(__u32 f)
-{
- return ceph_frag_bits(f) > 0 &&
- (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
-}
-static inline int ceph_frag_is_right_child(__u32 f)
-{
- return ceph_frag_bits(f) > 0 &&
- (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
-}
-static inline __u32 ceph_frag_sibling(__u32 f)
-{
- return ceph_frag_make(ceph_frag_bits(f),
- ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
-}
-static inline __u32 ceph_frag_left_child(__u32 f)
-{
- return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
-}
-static inline __u32 ceph_frag_right_child(__u32 f)
-{
- return ceph_frag_make(ceph_frag_bits(f)+1,
- ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
-}
static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
{
int newbits = ceph_frag_bits(f) + by;
return ceph_frag_make(newbits,
ceph_frag_value(f) | (i << (24 - newbits)));
}
-static inline int ceph_frag_is_leftmost(__u32 f)
+static inline bool ceph_frag_is_leftmost(__u32 f)
{
return ceph_frag_value(f) == 0;
}
-static inline int ceph_frag_is_rightmost(__u32 f)
+static inline bool ceph_frag_is_rightmost(__u32 f)
{
return ceph_frag_value(f) == ceph_frag_mask(f);
}
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 2ad7b860f062..c7f2c63b3bc3 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ceph_fs.h - Ceph constants and data types to share between kernel and
* user space.
@@ -27,16 +28,16 @@
#define CEPH_INO_ROOT 1
-#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
-#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */
+#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
+#define CEPH_INO_GLOBAL_SNAPREALM 3 /* global dummy snaprealm */
/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
#define CEPH_MAX_MON 31
/*
- * ceph_file_layout - describe data layout for a file/inode
+ * legacy ceph_file_layoute
*/
-struct ceph_file_layout {
+struct ceph_file_layout_legacy {
/* file -> object mapping */
__le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
of page size. */
@@ -53,9 +54,26 @@ struct ceph_file_layout {
__le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
} __attribute__ ((packed));
-#define CEPH_MIN_STRIPE_UNIT 65536
+struct ceph_string;
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ u32 stripe_unit; /* stripe unit, in bytes */
+ u32 stripe_count; /* over this many objects */
+ u32 object_size; /* until objects are this big */
+ s64 pool_id; /* rados pool id */
+ struct ceph_string __rcu *pool_ns; /* rados pool namespace */
+};
+
+extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+extern void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy);
+extern void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy);
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+#define CEPH_MIN_STRIPE_UNIT 65536
struct ceph_dir_layout {
__u8 dl_dir_hash; /* see ceph_hash.h for ids */
@@ -75,8 +93,19 @@ struct ceph_dir_layout {
#define CEPH_AUTH_NONE 0x1
#define CEPH_AUTH_CEPHX 0x2
+#define CEPH_AUTH_MODE_NONE 0
+#define CEPH_AUTH_MODE_AUTHORIZER 1
+#define CEPH_AUTH_MODE_MON 10
+
+/* msgr2 protocol modes */
+#define CEPH_CON_MODE_UNKNOWN 0x0
+#define CEPH_CON_MODE_CRC 0x1
+#define CEPH_CON_MODE_SECURE 0x2
+
#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+const char *ceph_auth_proto_name(int proto);
+const char *ceph_con_mode_name(int mode);
/*********************************************
* message layer
@@ -104,6 +133,7 @@ struct ceph_dir_layout {
/* client <-> mds */
#define CEPH_MSG_MDS_MAP 21
+#define CEPH_MSG_FS_MAP_USER 103
#define CEPH_MSG_CLIENT_SESSION 22
#define CEPH_MSG_CLIENT_RECONNECT 23
@@ -111,41 +141,37 @@ struct ceph_dir_layout {
#define CEPH_MSG_CLIENT_REQUEST 24
#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_METRICS 29
#define CEPH_MSG_CLIENT_CAPS 0x310
#define CEPH_MSG_CLIENT_LEASE 0x311
#define CEPH_MSG_CLIENT_SNAP 0x312
#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
+#define CEPH_MSG_CLIENT_QUOTA 0x314
/* pool ops */
#define CEPH_MSG_POOLOP_REPLY 48
#define CEPH_MSG_POOLOP 49
+/* mon commands */
+#define CEPH_MSG_MON_COMMAND 50
+#define CEPH_MSG_MON_COMMAND_ACK 51
/* osd */
#define CEPH_MSG_OSD_MAP 41
#define CEPH_MSG_OSD_OP 42
#define CEPH_MSG_OSD_OPREPLY 43
#define CEPH_MSG_WATCH_NOTIFY 44
+#define CEPH_MSG_OSD_BACKOFF 61
/* watch-notify operations */
enum {
- WATCH_NOTIFY = 1, /* notifying watcher */
- WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */
+ CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
+ CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
};
-/* pool operations */
-enum {
- POOL_OP_CREATE = 0x01,
- POOL_OP_DELETE = 0x02,
- POOL_OP_AUID_CHANGE = 0x03,
- POOL_OP_CREATE_SNAP = 0x11,
- POOL_OP_DELETE_SNAP = 0x12,
- POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
- POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
-};
-
struct ceph_mon_request_header {
__le64 have_version;
__le16 session_mon;
@@ -155,6 +181,8 @@ struct ceph_mon_request_header {
struct ceph_mon_statfs {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
+ __u8 contains_data_pool;
+ __le64 data_pool;
} __attribute__ ((packed));
struct ceph_statfs {
@@ -168,29 +196,12 @@ struct ceph_mon_statfs_reply {
struct ceph_statfs st;
} __attribute__ ((packed));
-const char *ceph_pool_op_name(int op);
-
-struct ceph_mon_poolop {
- struct ceph_mon_request_header monhdr;
- struct ceph_fsid fsid;
- __le32 pool;
- __le32 op;
- __le64 auid;
- __le64 snapid;
- __le32 name_len;
-} __attribute__ ((packed));
-
-struct ceph_mon_poolop_reply {
+struct ceph_mon_command {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
- __le32 reply_code;
- __le32 epoch;
- char has_data;
- char data[0];
-} __attribute__ ((packed));
-
-struct ceph_mon_unmanaged_snap {
- __le64 snapid;
+ __le32 num_strs; /* always 1 */
+ __le32 str_len;
+ char str[];
} __attribute__ ((packed));
struct ceph_osd_getmap {
@@ -211,8 +222,8 @@ struct ceph_client_mount {
#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */
struct ceph_mon_subscribe_item {
- __le64 have_version; __le64 have;
- __u8 onetime;
+ __le64 start;
+ __u8 flags;
} __attribute__ ((packed));
struct ceph_mon_subscribe_ack {
@@ -220,6 +231,8 @@ struct ceph_mon_subscribe_ack {
struct ceph_fsid fsid;
} __attribute__ ((packed));
+#define CEPH_FS_CLUSTER_ID_NONE -1
+
/*
* mdsmap flags
*/
@@ -282,8 +295,15 @@ enum {
CEPH_SESSION_RENEWCAPS,
CEPH_SESSION_STALE,
CEPH_SESSION_RECALL_STATE,
+ CEPH_SESSION_FLUSHMSG,
+ CEPH_SESSION_FLUSHMSG_ACK,
+ CEPH_SESSION_FORCE_RO,
+ CEPH_SESSION_REJECT,
+ CEPH_SESSION_REQUEST_FLUSH_MDLOG,
};
+#define CEPH_SESSION_BLOCKLISTED (1 << 0) /* session blocklisted */
+
extern const char *ceph_session_op_name(int op);
struct ceph_mds_session_head {
@@ -307,6 +327,8 @@ enum {
CEPH_MDS_OP_LOOKUPHASH = 0x00102,
CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
CEPH_MDS_OP_LOOKUPINO = 0x00104,
+ CEPH_MDS_OP_LOOKUPNAME = 0x00105,
+ CEPH_MDS_OP_GETVXATTR = 0x00106,
CEPH_MDS_OP_SETXATTR = 0x01105,
CEPH_MDS_OP_RMXATTR = 0x01106,
@@ -332,24 +354,61 @@ enum {
CEPH_MDS_OP_MKSNAP = 0x01400,
CEPH_MDS_OP_RMSNAP = 0x01401,
CEPH_MDS_OP_LSSNAP = 0x00402,
+ CEPH_MDS_OP_RENAMESNAP = 0x01403,
};
-extern const char *ceph_mds_op_name(int op);
+#define IS_CEPH_MDS_OP_NEWINODE(op) (op == CEPH_MDS_OP_CREATE || \
+ op == CEPH_MDS_OP_MKNOD || \
+ op == CEPH_MDS_OP_MKDIR || \
+ op == CEPH_MDS_OP_SYMLINK)
+extern const char *ceph_mds_op_name(int op);
-#define CEPH_SETATTR_MODE 1
-#define CEPH_SETATTR_UID 2
-#define CEPH_SETATTR_GID 4
-#define CEPH_SETATTR_MTIME 8
-#define CEPH_SETATTR_ATIME 16
-#define CEPH_SETATTR_SIZE 32
-#define CEPH_SETATTR_CTIME 64
+#define CEPH_SETATTR_MODE (1 << 0)
+#define CEPH_SETATTR_UID (1 << 1)
+#define CEPH_SETATTR_GID (1 << 2)
+#define CEPH_SETATTR_MTIME (1 << 3)
+#define CEPH_SETATTR_ATIME (1 << 4)
+#define CEPH_SETATTR_SIZE (1 << 5)
+#define CEPH_SETATTR_CTIME (1 << 6)
+#define CEPH_SETATTR_MTIME_NOW (1 << 7)
+#define CEPH_SETATTR_ATIME_NOW (1 << 8)
+#define CEPH_SETATTR_BTIME (1 << 9)
+#define CEPH_SETATTR_KILL_SGUID (1 << 10)
+#define CEPH_SETATTR_FSCRYPT_AUTH (1 << 11)
+#define CEPH_SETATTR_FSCRYPT_FILE (1 << 12)
/*
* Ceph setxattr request flags.
*/
-#define CEPH_XATTR_CREATE 1
-#define CEPH_XATTR_REPLACE 2
+#define CEPH_XATTR_CREATE (1 << 0)
+#define CEPH_XATTR_REPLACE (1 << 1)
+#define CEPH_XATTR_REMOVE (1 << 31)
+
+/*
+ * readdir request flags;
+ */
+#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
+
+/*
+ * readdir reply flags.
+ */
+#define CEPH_READDIR_FRAG_END (1<<0)
+#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
+#define CEPH_READDIR_HASH_ORDER (1<<9)
+#define CEPH_READDIR_OFFSET_HASH (1<<10)
+
+/*
+ * open request flags
+ */
+#define CEPH_O_RDONLY 00000000
+#define CEPH_O_WRONLY 00000001
+#define CEPH_O_RDWR 00000002
+#define CEPH_O_CREAT 00000100
+#define CEPH_O_EXCL 00000200
+#define CEPH_O_TRUNC 00001000
+#define CEPH_O_DIRECTORY 00200000
+#define CEPH_O_NOFOLLOW 00400000
union ceph_mds_request_args {
struct {
@@ -368,6 +427,8 @@ union ceph_mds_request_args {
__le32 frag; /* which dir fragment */
__le32 max_entries; /* how many dentries to grab */
__le32 max_bytes;
+ __le16 flags;
+ __le32 offset_hash;
} __attribute__ ((packed)) readdir;
struct {
__le32 mode;
@@ -382,30 +443,53 @@ union ceph_mds_request_args {
__le32 stripe_unit; /* layout for newly created file */
__le32 stripe_count; /* ... */
__le32 object_size;
- __le32 file_replication;
- __le32 unused; /* used to be preferred osd */
+ __le32 pool;
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 old_size;
} __attribute__ ((packed)) open;
struct {
__le32 flags;
+ __le32 osdmap_epoch; /* used for setting file/dir layouts */
} __attribute__ ((packed)) setxattr;
struct {
- struct ceph_file_layout layout;
+ struct ceph_file_layout_legacy layout;
} __attribute__ ((packed)) setlayout;
struct {
__u8 rule; /* currently fcntl or flock */
__u8 type; /* shared, exclusive, remove*/
+ __le64 owner; /* owner of the lock */
__le64 pid; /* process id requesting the lock */
- __le64 pid_namespace;
__le64 start; /* initial location to lock */
__le64 length; /* num bytes to lock from start */
__u8 wait; /* will caller wait for lock to become available? */
} __attribute__ ((packed)) filelock_change;
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 snapid;
+ __le64 parent;
+ __le32 hash;
+ } __attribute__ ((packed)) lookupino;
} __attribute__ ((packed));
-#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
-#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+union ceph_mds_request_args_ext {
+ union ceph_mds_request_args old;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ struct ceph_timespec btime;
+ } __attribute__ ((packed)) setattr_ext;
+};
+
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */
-struct ceph_mds_request_head {
+struct ceph_mds_request_head_legacy {
__le64 oldest_client_tid;
__le32 mdsmap_epoch; /* on client */
__le32 flags; /* CEPH_MDS_FLAG_* */
@@ -418,6 +502,28 @@ struct ceph_mds_request_head {
union ceph_mds_request_args args;
} __attribute__ ((packed));
+#define CEPH_MDS_REQUEST_HEAD_VERSION 3
+
+struct ceph_mds_request_head {
+ __le16 version; /* struct version */
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args_ext args;
+
+ __le32 ext_num_retry; /* new count retry attempts */
+ __le32 ext_num_fwd; /* new count fwd attempts */
+
+ __le32 struct_len; /* to store size of struct ceph_mds_request_head */
+ __le32 owner_uid, owner_gid; /* used for OPs which create inodes */
+} __attribute__ ((packed));
+
/* cap/lease release record */
struct ceph_mds_request_release {
__le64 ino, cap_id; /* ino and unique cap id */
@@ -457,7 +563,8 @@ struct ceph_mds_reply_cap {
__u8 flags; /* CEPH_CAP_FLAG_* */
} __attribute__ ((packed));
-#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
+#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */
+#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* release the cap */
/* inode record, for bundling with mds reply */
struct ceph_mds_reply_inode {
@@ -467,7 +574,7 @@ struct ceph_mds_reply_inode {
__le64 version; /* inode version */
__le64 xattr_version; /* version for xattr blob */
struct ceph_mds_reply_cap cap; /* caps issued for this inode */
- struct ceph_file_layout layout;
+ struct ceph_file_layout_legacy layout;
struct ceph_timespec ctime, mtime, atime;
__le32 time_warp_seq;
__le64 size, max_size, truncate_size;
@@ -487,6 +594,9 @@ struct ceph_mds_reply_lease {
__le32 seq;
} __attribute__ ((packed));
+#define CEPH_LEASE_VALID (1 | 2) /* old and new bit values */
+#define CEPH_LEASE_PRIMARY_LINK 4 /* primary linkage */
+
struct ceph_mds_reply_dirfrag {
__le32 frag; /* fragment */
__le32 auth; /* auth mds, if this is a delegation point */
@@ -494,8 +604,11 @@ struct ceph_mds_reply_dirfrag {
__le32 dist[];
} __attribute__ ((packed));
-#define CEPH_LOCK_FCNTL 1
-#define CEPH_LOCK_FLOCK 2
+#define CEPH_LOCK_FCNTL 1
+#define CEPH_LOCK_FLOCK 2
+#define CEPH_LOCK_FCNTL_INTR 3
+#define CEPH_LOCK_FLOCK_INTR 4
+
#define CEPH_LOCK_SHARED 1
#define CEPH_LOCK_EXCL 2
@@ -505,8 +618,8 @@ struct ceph_filelock {
__le64 start;/* file offset to start lock at */
__le64 length; /* num bytes to lock; 0 for all following start */
__le64 client; /* which client holds the lock */
+ __le64 owner; /* owner the lock */
__le64 pid; /* process id holding the lock on the client */
- __le64 pid_namespace;
__u8 type; /* shared lock, exclusive lock, or unlock */
} __attribute__ ((packed));
@@ -517,10 +630,12 @@ struct ceph_filelock {
#define CEPH_FILE_MODE_WR 2
#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
-#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
+#define CEPH_FILE_MODE_BITS 4
+#define CEPH_FILE_MODE_MASK ((1 << CEPH_FILE_MODE_BITS) - 1)
int ceph_flags_to_mode(int flags);
+#define CEPH_INLINE_NONE ((__u64)-1)
/* capability bits */
#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
@@ -585,6 +700,9 @@ int ceph_flags_to_mode(int flags);
CEPH_CAP_LINK_SHARED | \
CEPH_CAP_FILE_SHARED | \
CEPH_CAP_XATTR_SHARED)
+#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_FILE_RD)
+#define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND
#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
CEPH_CAP_LINK_SHARED | \
@@ -597,16 +715,27 @@ int ceph_flags_to_mode(int flags);
CEPH_CAP_LINK_EXCL | \
CEPH_CAP_XATTR_EXCL | \
CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \
+ CEPH_CAP_FILE_SHARED)
#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
CEPH_CAP_FILE_EXCL)
#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
CEPH_CAP_PIN)
+#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \
+ CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)
#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
CEPH_LOCK_IXATTR)
+/* cap masks async dir operations */
+#define CEPH_CAP_DIR_CREATE CEPH_CAP_FILE_CACHE
+#define CEPH_CAP_DIR_UNLINK CEPH_CAP_FILE_RD
+#define CEPH_CAP_ANY_DIR_OPS (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO)
+
int ceph_caps_for_mode(int mode);
enum {
@@ -627,6 +756,11 @@ enum {
extern const char *ceph_cap_op_name(int op);
+/* flags field in client cap messages (version >= 10) */
+#define CEPH_CLIENT_CAPS_SYNC (1<<0)
+#define CEPH_CLIENT_CAPS_NO_CAPSNAP (1<<1)
+#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP (1<<2)
+
/*
* caps message, used for capability callbacks, acks, requests, etc.
*/
@@ -650,14 +784,22 @@ struct ceph_mds_caps {
__le32 xattr_len;
__le64 xattr_version;
- /* filelock */
+ /* a union of non-export and export bodies. */
__le64 size, max_size, truncate_size;
__le32 truncate_seq;
struct ceph_timespec mtime, atime, ctime;
- struct ceph_file_layout layout;
+ struct ceph_file_layout_legacy layout;
__le32 time_warp_seq;
} __attribute__ ((packed));
+struct ceph_mds_cap_peer {
+ __le64 cap_id;
+ __le32 issue_seq;
+ __le32 mseq;
+ __le32 mds;
+ __u8 flags;
+} __attribute__ ((packed));
+
/* cap release msg head */
struct ceph_mds_cap_release {
__le32 num; /* number of cap_items that follow */
@@ -666,7 +808,7 @@ struct ceph_mds_cap_release {
struct ceph_mds_cap_item {
__le64 ino;
__le64 cap_id;
- __le32 migrate_seq, seq;
+ __le32 migrate_seq, issue_seq;
} __attribute__ ((packed));
#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
@@ -750,4 +892,20 @@ struct ceph_mds_snap_realm {
} __attribute__ ((packed));
/* followed by my snap list, then prior parent snap list */
+/*
+ * quotas
+ */
+struct ceph_mds_quota {
+ __le64 ino; /* ino */
+ struct ceph_timespec rctime;
+ __le64 rbytes; /* dir stats */
+ __le64 rfiles;
+ __le64 rsubdirs;
+ __u8 struct_v; /* compat */
+ __u8 struct_compat;
+ __le32 struct_len;
+ __le64 max_bytes; /* quota max. bytes */
+ __le64 max_files; /* quota max. files */
+} __attribute__ ((packed));
+
#endif
diff --git a/include/linux/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h
index d099c3f90236..fda474c7a5d6 100644
--- a/include/linux/ceph/ceph_hash.h
+++ b/include/linux/ceph/ceph_hash.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef FS_CEPH_HASH_H
#define FS_CEPH_HASH_H
diff --git a/include/linux/ceph/cls_lock_client.h b/include/linux/ceph/cls_lock_client.h
new file mode 100644
index 000000000000..17bc7584d1fe
--- /dev/null
+++ b/include/linux/ceph/cls_lock_client.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CEPH_CLS_LOCK_CLIENT_H
+#define _LINUX_CEPH_CLS_LOCK_CLIENT_H
+
+#include <linux/ceph/osd_client.h>
+
+enum ceph_cls_lock_type {
+ CEPH_CLS_LOCK_NONE = 0,
+ CEPH_CLS_LOCK_EXCLUSIVE = 1,
+ CEPH_CLS_LOCK_SHARED = 2,
+};
+
+struct ceph_locker_id {
+ struct ceph_entity_name name; /* locker's client name */
+ char *cookie; /* locker's cookie */
+};
+
+struct ceph_locker_info {
+ struct ceph_entity_addr addr; /* locker's address */
+};
+
+struct ceph_locker {
+ struct ceph_locker_id id;
+ struct ceph_locker_info info;
+};
+
+int ceph_cls_lock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 type, char *cookie,
+ char *tag, char *desc, u8 flags);
+int ceph_cls_unlock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, char *cookie);
+int ceph_cls_break_lock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, char *cookie,
+ struct ceph_entity_name *locker);
+int ceph_cls_set_cookie(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 type, char *old_cookie,
+ char *tag, char *new_cookie);
+
+void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers);
+
+int ceph_cls_lock_info(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 *type, char **tag,
+ struct ceph_locker **lockers, u32 *num_lockers);
+
+int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
+ char *lock_name, u8 type, char *cookie, char *tag);
+
+#endif
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h
index 1df086d7882d..8b3a1a7a953a 100644
--- a/include/linux/ceph/debugfs.h
+++ b/include/linux/ceph/debugfs.h
@@ -1,32 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_DEBUGFS_H
#define _FS_CEPH_DEBUGFS_H
-#include <linux/ceph/ceph_debug.h>
#include <linux/ceph/types.h>
-#define CEPH_DEFINE_SHOW_FUNC(name) \
-static int name##_open(struct inode *inode, struct file *file) \
-{ \
- struct seq_file *sf; \
- int ret; \
- \
- ret = single_open(file, name, NULL); \
- sf = file->private_data; \
- sf->private = inode->i_private; \
- return ret; \
-} \
- \
-static const struct file_operations name##_fops = { \
- .open = name##_open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
-};
-
/* debugfs.c */
-extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_init(void);
extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_init(struct ceph_client *client);
extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
#endif
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 0442c3d800f0..8fc1aed64113 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -1,30 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CEPH_DECODE_H
#define __CEPH_DECODE_H
#include <linux/err.h>
#include <linux/bug.h>
+#include <linux/slab.h>
#include <linux/time.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/ceph/types.h>
-/* This seemed to be the easiest place to define these */
-
-#define U8_MAX ((u8)(~0U))
-#define U16_MAX ((u16)(~0U))
-#define U32_MAX ((u32)(~0U))
-#define U64_MAX ((u64)(~0ULL))
-
-#define S8_MAX ((s8)(U8_MAX >> 1))
-#define S16_MAX ((s16)(U16_MAX >> 1))
-#define S32_MAX ((s32)(U32_MAX >> 1))
-#define S64_MAX ((s64)(U64_MAX >> 1LL))
-
-#define S8_MIN ((s8)(-S8_MAX - 1))
-#define S16_MIN ((s16)(-S16_MAX - 1))
-#define S32_MIN ((s32)(-S32_MAX - 1))
-#define S64_MIN ((s64)(-S64_MAX - 1LL))
-
/*
* in all cases,
* void **p pointer to position pointer
@@ -64,7 +49,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
/*
* bounds check input.
*/
-static inline int ceph_has_room(void **p, void *end, size_t n)
+static inline bool ceph_has_room(void **p, void *end, size_t n)
{
return end >= *p && n <= end - *p;
}
@@ -149,16 +134,82 @@ bad:
}
/*
- * struct ceph_timespec <-> struct timespec
+ * skip helpers
*/
-static inline void ceph_decode_timespec(struct timespec *ts,
- const struct ceph_timespec *tv)
+#define ceph_decode_skip_n(p, end, n, bad) \
+ do { \
+ ceph_decode_need(p, end, n, bad); \
+ *p += n; \
+ } while (0)
+
+#define ceph_decode_skip_64(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u64), bad)
+
+#define ceph_decode_skip_32(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u32), bad)
+
+#define ceph_decode_skip_16(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u16), bad)
+
+#define ceph_decode_skip_8(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u8), bad)
+
+#define ceph_decode_skip_string(p, end, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ ceph_decode_skip_n(p, end, len, bad); \
+ } while (0)
+
+#define ceph_decode_skip_set(p, end, type, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ while (len--) \
+ ceph_decode_skip_##type(p, end, bad); \
+ } while (0)
+
+#define ceph_decode_skip_map(p, end, ktype, vtype, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ while (len--) { \
+ ceph_decode_skip_##ktype(p, end, bad); \
+ ceph_decode_skip_##vtype(p, end, bad); \
+ } \
+ } while (0)
+
+#define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ while (len--) { \
+ ceph_decode_skip_##ktype1(p, end, bad); \
+ ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \
+ } \
+ } while (0)
+
+/*
+ * struct ceph_timespec <-> struct timespec64
+ */
+static inline void ceph_decode_timespec64(struct timespec64 *ts,
+ const struct ceph_timespec *tv)
{
- ts->tv_sec = (__kernel_time_t)le32_to_cpu(tv->tv_sec);
+ /*
+ * This will still overflow in year 2106. We could extend
+ * the protocol to steal two more bits from tv_nsec to
+ * add three more 136 year epochs after that the way ext4
+ * does if necessary.
+ */
+ ts->tv_sec = (time64_t)le32_to_cpu(tv->tv_sec);
ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec);
}
-static inline void ceph_encode_timespec(struct ceph_timespec *tv,
- const struct timespec *ts)
+static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
+ const struct timespec64 *ts)
{
tv->tv_sec = cpu_to_le32((u32)ts->tv_sec);
tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec);
@@ -167,18 +218,35 @@ static inline void ceph_encode_timespec(struct ceph_timespec *tv,
/*
* sockaddr_storage <-> ceph_sockaddr
*/
-static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+#define CEPH_ENTITY_ADDR_TYPE_NONE 0
+#define CEPH_ENTITY_ADDR_TYPE_LEGACY __cpu_to_le32(1)
+#define CEPH_ENTITY_ADDR_TYPE_MSGR2 __cpu_to_le32(2)
+#define CEPH_ENTITY_ADDR_TYPE_ANY __cpu_to_le32(3)
+
+static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a)
{
__be16 ss_family = htons(a->in_addr.ss_family);
a->in_addr.ss_family = *(__u16 *)&ss_family;
+
+ /* Banner addresses require TYPE_NONE */
+ a->type = CEPH_ENTITY_ADDR_TYPE_NONE;
}
-static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a)
{
__be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
a->in_addr.ss_family = ntohs(ss_family);
WARN_ON(a->in_addr.ss_family == 512);
+ a->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
}
+extern int ceph_decode_entity_addr(void **p, void *end,
+ struct ceph_entity_addr *addr);
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr);
+
+int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr);
+void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr);
+
/*
* encoders
*/
@@ -234,6 +302,60 @@ static inline void ceph_encode_string(void **p, void *end,
*p += len;
}
+/*
+ * version and length starting block encoders/decoders
+ */
+
+/* current code version (u8) + compat code version (u8) + len of struct (u32) */
+#define CEPH_ENCODING_START_BLK_LEN 6
+
+/**
+ * ceph_start_encoding - start encoding block
+ * @struct_v: current (code) version of the encoding
+ * @struct_compat: oldest code version that can decode it
+ * @struct_len: length of struct encoding
+ */
+static inline void ceph_start_encoding(void **p, u8 struct_v, u8 struct_compat,
+ u32 struct_len)
+{
+ ceph_encode_8(p, struct_v);
+ ceph_encode_8(p, struct_compat);
+ ceph_encode_32(p, struct_len);
+}
+
+/**
+ * ceph_start_decoding - start decoding block
+ * @v: current version of the encoding that the code supports
+ * @name: name of the struct (free-form)
+ * @struct_v: out param for the encoding version
+ * @struct_len: out param for the length of struct encoding
+ *
+ * Validates the length of struct encoding, so unsafe ceph_decode_*
+ * variants can be used for decoding.
+ */
+static inline int ceph_start_decoding(void **p, void *end, u8 v,
+ const char *name, u8 *struct_v,
+ u32 *struct_len)
+{
+ u8 struct_compat;
+
+ ceph_decode_need(p, end, CEPH_ENCODING_START_BLK_LEN, bad);
+ *struct_v = ceph_decode_8(p);
+ struct_compat = ceph_decode_8(p);
+ if (v < struct_compat) {
+ pr_warn("got struct_v %d struct_compat %d > %d of %s\n",
+ *struct_v, struct_compat, v, name);
+ return -EINVAL;
+ }
+
+ *struct_len = ceph_decode_32(p);
+ ceph_decode_need(p, end, *struct_len, bad);
+ return 0;
+
+bad:
+ return -ERANGE;
+}
+
#define ceph_encode_need(p, end, n, bad) \
do { \
if (!likely(ceph_has_room(p, end, n))) \
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 2e3024881a5e..63e0e2aa1ce9 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_LIBCEPH_H
#define _FS_CEPH_LIBCEPH_H
#include <linux/ceph/ceph_debug.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/backing-dev.h>
#include <linux/completion.h>
#include <linux/exportfs.h>
@@ -14,6 +15,7 @@
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/slab.h>
+#include <linux/refcount.h>
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
@@ -21,6 +23,7 @@
#include <linux/ceph/mon_client.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/ceph_fs.h>
+#include <linux/ceph/string_table.h>
/*
* mount options
@@ -28,9 +31,13 @@
#define CEPH_OPT_FSID (1<<0)
#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
-#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */
+#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes (msgr1) */
+#define CEPH_OPT_TCP_NODELAY (1<<4) /* TCP_NODELAY on TCP sockets */
+#define CEPH_OPT_NOMSGSIGN (1<<5) /* don't sign msgs (msgr1) */
+#define CEPH_OPT_ABORT_ON_FULL (1<<6) /* abort w/ ENOSPC when full */
+#define CEPH_OPT_RXBOUNCE (1<<7) /* double-buffer read data */
-#define CEPH_OPT_DEFAULT (0)
+#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY)
#define ceph_set_opt(client, opt) \
(client)->options->flags |= CEPH_OPT_##opt;
@@ -41,12 +48,15 @@ struct ceph_options {
int flags;
struct ceph_fsid fsid;
struct ceph_entity_addr my_addr;
- int mount_timeout;
- int osd_idle_ttl;
- int osd_keepalive_timeout;
+ unsigned long mount_timeout; /* jiffies */
+ unsigned long osd_idle_ttl; /* jiffies */
+ unsigned long osd_keepalive_timeout; /* jiffies */
+ unsigned long osd_request_timeout; /* jiffies */
+ u32 read_from_replica; /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */
+ int con_modes[2]; /* CEPH_CON_MODE_* */
/*
- * any type that can't be simply compared or doesn't need need
+ * any type that can't be simply compared or doesn't need
* to be compared should go beyond this point,
* ceph_compare_options() should be updated accordingly
*/
@@ -56,48 +66,42 @@ struct ceph_options {
int num_mon;
char *name;
struct ceph_crypto_key *key;
+ struct rb_root crush_locs;
};
/*
* defaults
*/
-#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
-#define CEPH_OSD_KEEPALIVE_DEFAULT 5
-#define CEPH_OSD_IDLE_TTL_DEFAULT 60
-
+#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
+#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */
+#define CEPH_READ_FROM_REPLICA_DEFAULT 0 /* read from primary */
+
+#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
+#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
+#define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000)
+#define CEPH_MONC_HUNT_BACKOFF 2
+#define CEPH_MONC_HUNT_MAX_MULT 10
+
+#define CEPH_MSG_MAX_CONTROL_LEN (16*1024*1024)
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
-
-#define CEPH_AUTH_NAME_DEFAULT "guest"
/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file. Delay a minimum amount of time, even if we send a cap
- * message for some other reason. Otherwise, take the oppotunity to
- * update the mds to avoid sending another message later.
+ * The largest possible rbd data object is 32M.
+ * The largest possible rbd object map object is 64M.
+ *
+ * There is no limit on the size of cephfs objects, but it has to obey
+ * rsize and wsize mount options anyway.
*/
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
-
-#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
-
-/* mount state */
-enum {
- CEPH_MOUNT_MOUNTING,
- CEPH_MOUNT_MOUNTED,
- CEPH_MOUNT_UNMOUNTING,
- CEPH_MOUNT_UNMOUNTED,
- CEPH_MOUNT_SHUTDOWN,
-};
+#define CEPH_MSG_MAX_DATA_LEN (64*1024*1024)
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
+#define CEPH_AUTH_NAME_DEFAULT "guest"
+
+static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
{
- BUG_ON(time_after(b, a));
- return (long)a - (long)b;
+ return timeout ?: MAX_SCHEDULE_TIMEOUT;
}
struct ceph_mds_client;
@@ -122,8 +126,8 @@ struct ceph_client {
int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
- u32 supported_features;
- u32 required_features;
+ u64 supported_features;
+ u64 required_features;
struct ceph_messenger msgr; /* messenger instance */
struct ceph_mon_client monc;
@@ -133,10 +137,16 @@ struct ceph_client {
struct dentry *debugfs_dir;
struct dentry *debugfs_monmap;
struct dentry *debugfs_osdmap;
+ struct dentry *debugfs_options;
#endif
};
+#define from_msgr(ms) container_of(ms, struct ceph_client, msgr)
+static inline bool ceph_msgr2(struct ceph_client *client)
+{
+ return client->options->con_modes[0] != CEPH_CON_MODE_UNKNOWN;
+}
/*
* snapshots
@@ -151,7 +161,7 @@ struct ceph_client {
* dirtied.
*/
struct ceph_snap_context {
- atomic_t nref;
+ refcount_t nref;
u64 seq;
u32 num_snaps;
u64 snaps[];
@@ -169,58 +179,146 @@ extern void ceph_put_snap_context(struct ceph_snap_context *sc);
*/
static inline int calc_pages_for(u64 off, u64 len)
{
- return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
- (off >> PAGE_CACHE_SHIFT);
+ return ((off+len+PAGE_SIZE-1) >> PAGE_SHIFT) -
+ (off >> PAGE_SHIFT);
}
-/* ceph_common.c */
-extern bool libceph_compatible(void *data);
+#define RB_BYVAL(a) (a)
+#define RB_BYPTR(a) (&(a))
+#define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b))
+
+#define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
+static bool __insert_##name(struct rb_root *root, type *t) \
+{ \
+ struct rb_node **n = &root->rb_node; \
+ struct rb_node *parent = NULL; \
+ \
+ BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \
+ \
+ while (*n) { \
+ type *cur = rb_entry(*n, type, nodefld); \
+ int cmp; \
+ \
+ parent = *n; \
+ cmp = cmpexp(keyexp(t->keyfld), keyexp(cur->keyfld)); \
+ if (cmp < 0) \
+ n = &(*n)->rb_left; \
+ else if (cmp > 0) \
+ n = &(*n)->rb_right; \
+ else \
+ return false; \
+ } \
+ \
+ rb_link_node(&t->nodefld, parent, n); \
+ rb_insert_color(&t->nodefld, root); \
+ return true; \
+} \
+static void __maybe_unused insert_##name(struct rb_root *root, type *t) \
+{ \
+ if (!__insert_##name(root, t)) \
+ BUG(); \
+} \
+static void erase_##name(struct rb_root *root, type *t) \
+{ \
+ BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \
+ rb_erase(&t->nodefld, root); \
+ RB_CLEAR_NODE(&t->nodefld); \
+}
+
+/*
+ * @lookup_param_type is a parameter and not constructed from (@type,
+ * @keyfld) with typeof() because adding const is too unwieldy.
+ */
+#define DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \
+ lookup_param_type, nodefld) \
+static type *lookup_##name(struct rb_root *root, lookup_param_type key) \
+{ \
+ struct rb_node *n = root->rb_node; \
+ \
+ while (n) { \
+ type *cur = rb_entry(n, type, nodefld); \
+ int cmp; \
+ \
+ cmp = cmpexp(key, keyexp(cur->keyfld)); \
+ if (cmp < 0) \
+ n = n->rb_left; \
+ else if (cmp > 0) \
+ n = n->rb_right; \
+ else \
+ return cur; \
+ } \
+ \
+ return NULL; \
+}
+
+#define DEFINE_RB_FUNCS2(name, type, keyfld, cmpexp, keyexp, \
+ lookup_param_type, nodefld) \
+DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
+DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \
+ lookup_param_type, nodefld)
+
+/*
+ * Shorthands for integer keys.
+ */
+#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
+DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, nodefld)
+
+#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
+extern type __lookup_##name##_key; \
+DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, \
+ typeof(__lookup_##name##_key.keyfld), nodefld)
+
+#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
+DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
+DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
-extern const char *ceph_msg_type_name(int type);
-extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
extern struct kmem_cache *ceph_inode_cachep;
extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_cap_snap_cachep;
+extern struct kmem_cache *ceph_cap_flush_cachep;
extern struct kmem_cache *ceph_dentry_cachep;
extern struct kmem_cache *ceph_file_cachep;
+extern struct kmem_cache *ceph_dir_file_cachep;
+extern struct kmem_cache *ceph_mds_request_cachep;
+extern mempool_t *ceph_wb_pagevec_pool;
-extern struct ceph_options *ceph_parse_options(char *options,
- const char *dev_name, const char *dev_name_end,
- int (*parse_extra_token)(char *c, void *private),
- void *private);
+/* ceph_common.c */
+extern bool libceph_compatible(void *data);
+
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+extern int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid);
+
+struct fs_parameter;
+struct fc_log;
+struct ceph_options *ceph_alloc_options(void);
+int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt,
+ struct fc_log *l, char delim);
+int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
+ struct fc_log *l);
+int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
+ bool show_all);
extern void ceph_destroy_options(struct ceph_options *opt);
extern int ceph_compare_options(struct ceph_options *new_opt,
struct ceph_client *client);
-extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
- void *private,
- unsigned supported_features,
- unsigned required_features);
-extern u64 ceph_client_id(struct ceph_client *client);
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private);
+struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
+u64 ceph_client_gid(struct ceph_client *client);
extern void ceph_destroy_client(struct ceph_client *client);
-extern int __ceph_open_session(struct ceph_client *client,
- unsigned long started);
+extern void ceph_reset_client_addr(struct ceph_client *client);
+extern int __ceph_open_session(struct ceph_client *client);
extern int ceph_open_session(struct ceph_client *client);
+int ceph_wait_for_latest_osdmap(struct ceph_client *client,
+ unsigned long timeout);
/* pagevec.c */
extern void ceph_release_page_vector(struct page **pages, int num_pages);
-
-extern struct page **ceph_get_direct_page_vector(const void __user *data,
- int num_pages,
- bool write_page);
extern void ceph_put_page_vector(struct page **pages, int num_pages,
bool dirty);
-extern void ceph_release_page_vector(struct page **pages, int num_pages);
extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
-extern int ceph_copy_user_to_page_vector(struct page **pages,
- const void __user *data,
- loff_t off, size_t len);
-extern void ceph_copy_to_page_vector(struct page **pages,
- const void *data,
- loff_t off, size_t len);
extern void ceph_copy_from_page_vector(struct page **pages,
void *data,
loff_t off, size_t len);
-extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data,
- loff_t off, size_t len);
extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
deleted file mode 100644
index 87ed09f54800..000000000000
--- a/include/linux/ceph/mdsmap.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef _FS_CEPH_MDSMAP_H
-#define _FS_CEPH_MDSMAP_H
-
-#include <linux/bug.h>
-#include <linux/ceph/types.h>
-
-/*
- * mds map - describe servers in the mds cluster.
- *
- * we limit fields to those the client actually xcares about
- */
-struct ceph_mds_info {
- u64 global_id;
- struct ceph_entity_addr addr;
- s32 state;
- int num_export_targets;
- bool laggy;
- u32 *export_targets;
-};
-
-struct ceph_mdsmap {
- u32 m_epoch, m_client_epoch, m_last_failure;
- u32 m_root;
- u32 m_session_timeout; /* seconds */
- u32 m_session_autoclose; /* seconds */
- u64 m_max_file_size;
- u32 m_max_mds; /* size of m_addr, m_state arrays */
- struct ceph_mds_info *m_info;
-
- /* which object pools file data can be stored in */
- int m_num_data_pg_pools;
- u64 *m_data_pg_pools;
- u64 m_cas_pg_pool;
-};
-
-static inline struct ceph_entity_addr *
-ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
-{
- if (w >= m->m_max_mds)
- return NULL;
- return &m->m_info[w].addr;
-}
-
-static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
-{
- BUG_ON(w < 0);
- if (w >= m->m_max_mds)
- return CEPH_MDS_STATE_DNE;
- return m->m_info[w].state;
-}
-
-static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
-{
- if (w >= 0 && w < m->m_max_mds)
- return m->m_info[w].laggy;
- return false;
-}
-
-extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
-extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
-
-#endif
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 7c1420bb1dce..6aa4c6478c9f 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -1,18 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FS_CEPH_MESSENGER_H
#define __FS_CEPH_MESSENGER_H
+#include <crypto/sha2.h>
+#include <linux/bvec.h>
+#include <linux/crypto.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/net.h>
#include <linux/radix-tree.h>
#include <linux/uio.h>
#include <linux/workqueue.h>
+#include <net/net_namespace.h>
#include <linux/ceph/types.h>
#include <linux/ceph/buffer.h>
struct ceph_msg;
struct ceph_connection;
+struct ceph_msg_data_cursor;
/*
* Ceph defines these callbacks for handling connection events.
@@ -28,7 +34,10 @@ struct ceph_connection_operations {
struct ceph_auth_handshake *(*get_authorizer) (
struct ceph_connection *con,
int *proto, int force_new);
- int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
+ int (*add_authorizer_challenge)(struct ceph_connection *con,
+ void *challenge_buf,
+ int challenge_buf_len);
+ int (*verify_authorizer_reply) (struct ceph_connection *con);
int (*invalidate_authorizer)(struct ceph_connection *con);
/* there was some error on the socket (disconnect, whatever) */
@@ -41,9 +50,55 @@ struct ceph_connection_operations {
struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
struct ceph_msg_header *hdr,
int *skip);
+
+ void (*reencode_message) (struct ceph_msg *msg);
+
+ int (*sign_message) (struct ceph_msg *msg);
+ int (*check_message_signature) (struct ceph_msg *msg);
+
+ /* msgr2 authentication exchange */
+ int (*get_auth_request)(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len);
+ int (*handle_auth_reply_more)(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len);
+ int (*handle_auth_done)(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+ int (*handle_auth_bad_method)(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt);
+
+ /**
+ * sparse_read: read sparse data
+ * @con: connection we're reading from
+ * @cursor: data cursor for reading extents
+ * @buf: optional buffer to read into
+ *
+ * This should be called more than once, each time setting up to
+ * receive an extent into the current cursor position, and zeroing
+ * the holes between them.
+ *
+ * Returns amount of data to be read (in bytes), 0 if reading is
+ * complete, or -errno if there was an error.
+ *
+ * If @buf is set on a >0 return, then the data should be read into
+ * the provided buffer. Otherwise, it should be read into the cursor.
+ *
+ * The sparse read operation is expected to initialize the cursor
+ * with a length covering up to the end of the last extent.
+ */
+ int (*sparse_read)(struct ceph_connection *con,
+ struct ceph_msg_data_cursor *cursor,
+ char **buf);
+
};
-/* use format string %s%d */
+/* use format string %s%lld */
#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
struct ceph_messenger {
@@ -51,7 +106,7 @@ struct ceph_messenger {
struct ceph_entity_addr my_enc_addr;
atomic_t stopping;
- bool nocrc;
+ possible_net_t net;
/*
* the global_seq counts connections i (attempt to) initiate
@@ -59,9 +114,6 @@ struct ceph_messenger {
*/
u32 global_seq;
spinlock_t global_seq_lock;
-
- u32 supported_features;
- u32 required_features;
};
enum ceph_msg_data_type {
@@ -71,58 +123,125 @@ enum ceph_msg_data_type {
#ifdef CONFIG_BLOCK
CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */
#endif /* CONFIG_BLOCK */
+ CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */
+ CEPH_MSG_DATA_ITER, /* data source/destination is an iov_iter */
};
-static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
-{
- switch (type) {
- case CEPH_MSG_DATA_NONE:
- case CEPH_MSG_DATA_PAGES:
- case CEPH_MSG_DATA_PAGELIST:
#ifdef CONFIG_BLOCK
- case CEPH_MSG_DATA_BIO:
+
+struct ceph_bio_iter {
+ struct bio *bio;
+ struct bvec_iter iter;
+};
+
+#define __ceph_bio_iter_advance_step(it, n, STEP) do { \
+ unsigned int __n = (n), __cur_n; \
+ \
+ while (__n) { \
+ BUG_ON(!(it)->iter.bi_size); \
+ __cur_n = min((it)->iter.bi_size, __n); \
+ (void)(STEP); \
+ bio_advance_iter((it)->bio, &(it)->iter, __cur_n); \
+ if (!(it)->iter.bi_size && (it)->bio->bi_next) { \
+ dout("__ceph_bio_iter_advance_step next bio\n"); \
+ (it)->bio = (it)->bio->bi_next; \
+ (it)->iter = (it)->bio->bi_iter; \
+ } \
+ __n -= __cur_n; \
+ } \
+} while (0)
+
+/*
+ * Advance @it by @n bytes.
+ */
+#define ceph_bio_iter_advance(it, n) \
+ __ceph_bio_iter_advance_step(it, n, 0)
+
+/*
+ * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec.
+ */
+#define ceph_bio_iter_advance_step(it, n, BVEC_STEP) \
+ __ceph_bio_iter_advance_step(it, n, ({ \
+ struct bio_vec bv; \
+ struct bvec_iter __cur_iter; \
+ \
+ __cur_iter = (it)->iter; \
+ __cur_iter.bi_size = __cur_n; \
+ __bio_for_each_segment(bv, (it)->bio, __cur_iter, __cur_iter) \
+ (void)(BVEC_STEP); \
+ }))
+
#endif /* CONFIG_BLOCK */
- return true;
- default:
- return false;
- }
-}
+
+struct ceph_bvec_iter {
+ struct bio_vec *bvecs;
+ struct bvec_iter iter;
+};
+
+#define __ceph_bvec_iter_advance_step(it, n, STEP) do { \
+ BUG_ON((n) > (it)->iter.bi_size); \
+ (void)(STEP); \
+ bvec_iter_advance((it)->bvecs, &(it)->iter, (n)); \
+} while (0)
+
+/*
+ * Advance @it by @n bytes.
+ */
+#define ceph_bvec_iter_advance(it, n) \
+ __ceph_bvec_iter_advance_step(it, n, 0)
+
+/*
+ * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec.
+ */
+#define ceph_bvec_iter_advance_step(it, n, BVEC_STEP) \
+ __ceph_bvec_iter_advance_step(it, n, ({ \
+ struct bio_vec bv; \
+ struct bvec_iter __cur_iter; \
+ \
+ __cur_iter = (it)->iter; \
+ __cur_iter.bi_size = (n); \
+ for_each_bvec(bv, (it)->bvecs, __cur_iter, __cur_iter) \
+ (void)(BVEC_STEP); \
+ }))
+
+#define ceph_bvec_iter_shorten(it, n) do { \
+ BUG_ON((n) > (it)->iter.bi_size); \
+ (it)->iter.bi_size = (n); \
+} while (0)
struct ceph_msg_data {
- struct list_head links; /* ceph_msg->data */
enum ceph_msg_data_type type;
union {
#ifdef CONFIG_BLOCK
struct {
- struct bio *bio;
- size_t bio_length;
+ struct ceph_bio_iter bio_pos;
+ u32 bio_length;
};
#endif /* CONFIG_BLOCK */
+ struct ceph_bvec_iter bvec_pos;
struct {
- struct page **pages; /* NOT OWNER. */
+ struct page **pages;
size_t length; /* total # bytes */
unsigned int alignment; /* first page */
+ bool own_pages;
};
struct ceph_pagelist *pagelist;
+ struct iov_iter iter;
};
};
struct ceph_msg_data_cursor {
size_t total_resid; /* across all data items */
- struct list_head *data_head; /* = &ceph_msg->data */
struct ceph_msg_data *data; /* current data item */
size_t resid; /* bytes not yet consumed */
- bool last_piece; /* current is last piece */
+ int sr_resid; /* residual sparse_read len */
bool need_crc; /* crc update needed */
union {
#ifdef CONFIG_BLOCK
- struct { /* bio */
- struct bio *bio; /* bio from list */
- unsigned int vector_index; /* vector from bio */
- unsigned int vector_offset; /* bytes from vector */
- };
+ struct ceph_bio_iter bio_iter;
#endif /* CONFIG_BLOCK */
+ struct bvec_iter bvec_iter;
struct { /* pages */
unsigned int page_offset; /* offset in page */
unsigned short page_index; /* index in array */
@@ -132,6 +251,10 @@ struct ceph_msg_data_cursor {
struct page *page; /* page from list */
size_t offset; /* bytes from list */
};
+ struct {
+ struct iov_iter iov_iter;
+ unsigned int lastlen;
+ };
};
};
@@ -142,30 +265,202 @@ struct ceph_msg_data_cursor {
*/
struct ceph_msg {
struct ceph_msg_header hdr; /* header */
- struct ceph_msg_footer footer; /* footer */
+ union {
+ struct ceph_msg_footer footer; /* footer */
+ struct ceph_msg_footer_old old_footer; /* old format footer */
+ };
struct kvec front; /* unaligned blobs of message */
struct ceph_buffer *middle;
size_t data_length;
- struct list_head data;
+ struct ceph_msg_data *data;
+ int num_data_items;
+ int max_data_items;
struct ceph_msg_data_cursor cursor;
struct ceph_connection *con;
struct list_head list_head; /* links for connection lists */
struct kref kref;
- bool front_is_vmalloc;
bool more_to_follow;
bool needs_out_seq;
- int front_max;
- unsigned long ack_stamp; /* tx: when we were acked */
+ u64 sparse_read_total;
+ int front_alloc_len;
struct ceph_msgpool *pool;
};
+/*
+ * connection states
+ */
+#define CEPH_CON_S_CLOSED 1
+#define CEPH_CON_S_PREOPEN 2
+#define CEPH_CON_S_V1_BANNER 3
+#define CEPH_CON_S_V1_CONNECT_MSG 4
+#define CEPH_CON_S_V2_BANNER_PREFIX 5
+#define CEPH_CON_S_V2_BANNER_PAYLOAD 6
+#define CEPH_CON_S_V2_HELLO 7
+#define CEPH_CON_S_V2_AUTH 8
+#define CEPH_CON_S_V2_AUTH_SIGNATURE 9
+#define CEPH_CON_S_V2_SESSION_CONNECT 10
+#define CEPH_CON_S_V2_SESSION_RECONNECT 11
+#define CEPH_CON_S_OPEN 12
+#define CEPH_CON_S_STANDBY 13
+
+/*
+ * ceph_connection flag bits
+ */
+#define CEPH_CON_F_LOSSYTX 0 /* we can close channel or drop
+ messages on errors */
+#define CEPH_CON_F_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
+#define CEPH_CON_F_WRITE_PENDING 2 /* we have data ready to send */
+#define CEPH_CON_F_SOCK_CLOSED 3 /* socket state changed to closed */
+#define CEPH_CON_F_BACKOFF 4 /* need to retry queuing delayed
+ work */
+
/* ceph connection fault delay defaults, for exponential backoff */
-#define BASE_DELAY_INTERVAL (HZ/2)
-#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
+#define BASE_DELAY_INTERVAL (HZ / 4)
+#define MAX_DELAY_INTERVAL (15 * HZ)
+
+struct ceph_connection_v1_info {
+ struct kvec out_kvec[8], /* sending header/footer data */
+ *out_kvec_cur;
+ int out_kvec_left; /* kvec's left in out_kvec */
+ int out_skip; /* skip this many bytes */
+ int out_kvec_bytes; /* total bytes left */
+ bool out_more; /* there is more data after the kvecs */
+ bool out_msg_done;
+
+ struct ceph_auth_handshake *auth;
+ int auth_retry; /* true if we need a newer authorizer */
+
+ /* connection negotiation temps */
+ u8 in_banner[CEPH_BANNER_MAX_LEN];
+ struct ceph_entity_addr actual_peer_addr;
+ struct ceph_entity_addr peer_addr_for_me;
+ struct ceph_msg_connect out_connect;
+ struct ceph_msg_connect_reply in_reply;
+
+ int in_base_pos; /* bytes read */
+
+ /* sparse reads */
+ struct kvec in_sr_kvec; /* current location to receive into */
+ u64 in_sr_len; /* amount of data in this extent */
+
+ /* message in temps */
+ u8 in_tag; /* protocol control byte */
+ struct ceph_msg_header in_hdr;
+ __le64 in_temp_ack; /* for reading an ack */
+
+ /* message out temps */
+ struct ceph_msg_header out_hdr;
+ __le64 out_temp_ack; /* for writing an ack */
+ struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2
+ stamp */
+
+ u32 connect_seq; /* identify the most recent connection
+ attempt for this session */
+ u32 peer_global_seq; /* peer's global seq for this connection */
+};
+
+#define CEPH_CRC_LEN 4
+#define CEPH_GCM_KEY_LEN 16
+#define CEPH_GCM_IV_LEN sizeof(struct ceph_gcm_nonce)
+#define CEPH_GCM_BLOCK_LEN 16
+#define CEPH_GCM_TAG_LEN 16
+
+#define CEPH_PREAMBLE_LEN 32
+#define CEPH_PREAMBLE_INLINE_LEN 48
+#define CEPH_PREAMBLE_PLAIN_LEN CEPH_PREAMBLE_LEN
+#define CEPH_PREAMBLE_SECURE_LEN (CEPH_PREAMBLE_LEN + \
+ CEPH_PREAMBLE_INLINE_LEN + \
+ CEPH_GCM_TAG_LEN)
+#define CEPH_EPILOGUE_PLAIN_LEN (1 + 3 * CEPH_CRC_LEN)
+#define CEPH_EPILOGUE_SECURE_LEN (CEPH_GCM_BLOCK_LEN + CEPH_GCM_TAG_LEN)
+
+#define CEPH_FRAME_MAX_SEGMENT_COUNT 4
+
+struct ceph_frame_desc {
+ int fd_tag; /* FRAME_TAG_* */
+ int fd_seg_cnt;
+ int fd_lens[CEPH_FRAME_MAX_SEGMENT_COUNT]; /* logical */
+ int fd_aligns[CEPH_FRAME_MAX_SEGMENT_COUNT];
+};
+
+struct ceph_gcm_nonce {
+ __le32 fixed;
+ __le64 counter __packed;
+};
+
+struct ceph_connection_v2_info {
+ struct iov_iter in_iter;
+ struct kvec in_kvecs[5]; /* recvmsg */
+ struct bio_vec in_bvec; /* recvmsg (in_cursor) */
+ int in_kvec_cnt;
+ int in_state; /* IN_S_* */
+
+ struct iov_iter out_iter;
+ struct kvec out_kvecs[8]; /* sendmsg */
+ struct bio_vec out_bvec; /* sendpage (out_cursor, out_zero),
+ sendmsg (out_enc_pages) */
+ int out_kvec_cnt;
+ int out_state; /* OUT_S_* */
+
+ int out_zero; /* # of zero bytes to send */
+ bool out_iter_sendpage; /* use sendpage if possible */
+
+ struct ceph_frame_desc in_desc;
+ struct ceph_msg_data_cursor in_cursor;
+ struct ceph_msg_data_cursor out_cursor;
+
+ struct hmac_sha256_key hmac_key; /* post-auth signature */
+ bool hmac_key_set;
+ struct crypto_aead *gcm_tfm; /* on-wire encryption */
+ struct aead_request *gcm_req;
+ struct crypto_wait gcm_wait;
+ struct ceph_gcm_nonce in_gcm_nonce;
+ struct ceph_gcm_nonce out_gcm_nonce;
+
+ struct page **in_enc_pages;
+ int in_enc_page_cnt;
+ int in_enc_resid;
+ int in_enc_i;
+ struct page **out_enc_pages;
+ int out_enc_page_cnt;
+ int out_enc_resid;
+ int out_enc_i;
+
+ int con_mode; /* CEPH_CON_MODE_* */
+
+ void *conn_bufs[16];
+ int conn_buf_cnt;
+ int data_len_remain;
+
+ struct kvec in_sign_kvecs[8];
+ struct kvec out_sign_kvecs[8];
+ int in_sign_kvec_cnt;
+ int out_sign_kvec_cnt;
+
+ u64 client_cookie;
+ u64 server_cookie;
+ u64 global_seq;
+ u64 connect_seq;
+ u64 peer_global_seq;
+
+ u8 in_buf[CEPH_PREAMBLE_SECURE_LEN];
+ u8 out_buf[CEPH_PREAMBLE_SECURE_LEN];
+ struct {
+ u8 late_status; /* FRAME_LATE_STATUS_* */
+ union {
+ struct {
+ u32 front_crc;
+ u32 middle_crc;
+ u32 data_crc;
+ } __packed;
+ u8 pad[CEPH_GCM_BLOCK_LEN - 1];
+ };
+ } out_epil;
+};
/*
* A single connection with another host.
@@ -181,25 +476,16 @@ struct ceph_connection {
struct ceph_messenger *msgr;
+ int state; /* CEPH_CON_S_* */
atomic_t sock_state;
struct socket *sock;
- struct ceph_entity_addr peer_addr; /* peer address */
- struct ceph_entity_addr peer_addr_for_me;
- unsigned long flags;
- unsigned long state;
+ unsigned long flags; /* CEPH_CON_F_* */
const char *error_msg; /* error message, if any */
struct ceph_entity_name peer_name; /* peer name */
-
- unsigned peer_features;
- u32 connect_seq; /* identify the most recent connection
- attempt for this connection, client */
- u32 peer_global_seq; /* peer's global seq for this connection */
-
- int auth_retry; /* true if we need a newer authorizer */
- void *auth_reply_buf; /* where to put the authorizer reply */
- int auth_reply_buf_len;
+ struct ceph_entity_addr peer_addr; /* peer address */
+ u64 peer_features;
struct mutex mutex;
@@ -210,55 +496,95 @@ struct ceph_connection {
u64 in_seq, in_seq_acked; /* last message received, acked */
- /* connection negotiation temps */
- char in_banner[CEPH_BANNER_MAX_LEN];
- struct ceph_msg_connect out_connect;
- struct ceph_msg_connect_reply in_reply;
- struct ceph_entity_addr actual_peer_addr;
-
- /* message out temps */
+ struct ceph_msg *in_msg;
struct ceph_msg *out_msg; /* sending message (== tail of
out_sent) */
- bool out_msg_done;
- struct kvec out_kvec[8], /* sending header/footer data */
- *out_kvec_cur;
- int out_kvec_left; /* kvec's left in out_kvec */
- int out_skip; /* skip this many bytes */
- int out_kvec_bytes; /* total bytes left */
- bool out_kvec_is_msg; /* kvec refers to out_msg */
- int out_more; /* there is more data after the kvecs */
- __le64 out_temp_ack; /* for writing an ack */
-
- /* message in temps */
- struct ceph_msg_header in_hdr;
- struct ceph_msg *in_msg;
+ struct page *bounce_page;
u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
- char in_tag; /* protocol control byte */
- int in_base_pos; /* bytes read */
- __le64 in_temp_ack; /* for reading an ack */
+ struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */
struct delayed_work work; /* send|recv work */
unsigned long delay; /* current delay interval */
+
+ union {
+ struct ceph_connection_v1_info v1;
+ struct ceph_connection_v2_info v2;
+ };
};
+extern struct page *ceph_zero_page;
+
+void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag);
+void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag);
+bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag);
+bool ceph_con_flag_test_and_clear(struct ceph_connection *con,
+ unsigned long con_flag);
+bool ceph_con_flag_test_and_set(struct ceph_connection *con,
+ unsigned long con_flag);
+
+void ceph_encode_my_addr(struct ceph_messenger *msgr);
+
+int ceph_tcp_connect(struct ceph_connection *con);
+int ceph_con_close_socket(struct ceph_connection *con);
+void ceph_con_reset_session(struct ceph_connection *con);
+
+u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt);
+void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq);
+void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq);
+
+void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
+ struct ceph_msg *msg, size_t length);
+struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length);
+void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes);
+
+u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
+ unsigned int length);
+
+bool ceph_addr_is_blank(const struct ceph_entity_addr *addr);
+int ceph_addr_port(const struct ceph_entity_addr *addr);
+void ceph_addr_set_port(struct ceph_entity_addr *addr, int p);
+
+void ceph_con_process_message(struct ceph_connection *con);
+int ceph_con_in_msg_alloc(struct ceph_connection *con,
+ struct ceph_msg_header *hdr, int *skip);
+struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con);
+
+/* messenger_v1.c */
+int ceph_con_v1_try_read(struct ceph_connection *con);
+int ceph_con_v1_try_write(struct ceph_connection *con);
+void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+void ceph_con_v1_revoke_incoming(struct ceph_connection *con);
+bool ceph_con_v1_opened(struct ceph_connection *con);
+void ceph_con_v1_reset_session(struct ceph_connection *con);
+void ceph_con_v1_reset_protocol(struct ceph_connection *con);
+
+/* messenger_v2.c */
+int ceph_con_v2_try_read(struct ceph_connection *con);
+int ceph_con_v2_try_write(struct ceph_connection *con);
+void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+void ceph_con_v2_revoke_incoming(struct ceph_connection *con);
+bool ceph_con_v2_opened(struct ceph_connection *con);
+void ceph_con_v2_reset_session(struct ceph_connection *con);
+void ceph_con_v2_reset_protocol(struct ceph_connection *con);
+
+
+extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr);
-extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
extern int ceph_parse_ips(const char *c, const char *end,
struct ceph_entity_addr *addr,
- int max_count, int *count);
-
+ int max_count, int *count, char delim);
extern int ceph_msgr_init(void);
extern void ceph_msgr_exit(void);
extern void ceph_msgr_flush(void);
extern void ceph_messenger_init(struct ceph_messenger *msgr,
- struct ceph_entity_addr *myaddr,
- u32 supported_features,
- u32 required_features,
- bool nocrc);
+ struct ceph_entity_addr *myaddr);
+extern void ceph_messenger_fini(struct ceph_messenger *msgr);
+extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr);
extern void ceph_con_init(struct ceph_connection *con, void *private,
const struct ceph_connection_operations *ops,
@@ -274,31 +600,29 @@ extern void ceph_msg_revoke(struct ceph_msg *msg);
extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
extern void ceph_con_keepalive(struct ceph_connection *con);
+extern bool ceph_con_keepalive_expired(struct ceph_connection *con,
+ unsigned long interval);
-extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
- size_t length, size_t alignment);
+void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
+ size_t length, size_t alignment, bool own_pages);
extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
struct ceph_pagelist *pagelist);
#ifdef CONFIG_BLOCK
-extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
- size_t length);
+void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
+ u32 length);
#endif /* CONFIG_BLOCK */
+void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
+ struct ceph_bvec_iter *bvec_pos);
+void ceph_msg_data_add_iter(struct ceph_msg *msg,
+ struct iov_iter *iter);
+struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
+ gfp_t flags, bool can_fail);
extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
bool can_fail);
-extern void ceph_msg_kfree(struct ceph_msg *m);
-
-
-static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
-{
- kref_get(&msg->kref);
- return msg;
-}
-extern void ceph_msg_last_put(struct kref *kref);
-static inline void ceph_msg_put(struct ceph_msg *msg)
-{
- kref_put(&msg->kref, ceph_msg_last_put);
-}
+
+extern struct ceph_msg *ceph_msg_get(struct ceph_msg *msg);
+extern void ceph_msg_put(struct ceph_msg *msg);
extern void ceph_msg_dump(struct ceph_msg *msg);
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index a486f390dfbe..7a9a40163c0f 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_MON_CLIENT_H
#define _FS_CEPH_MON_CLIENT_H
@@ -18,7 +19,7 @@ struct ceph_monmap {
struct ceph_fsid fsid;
u32 epoch;
u32 num_mon;
- struct ceph_entity_inst mon_inst[0];
+ struct ceph_entity_inst mon_inst[] __counted_by(num_mon);
};
struct ceph_mon_client;
@@ -39,21 +40,31 @@ struct ceph_mon_request {
ceph_monc_request_func_t do_request;
};
+typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
+
/*
- * ceph_mon_generic_request is being used for the statfs and poolop requests
- * which are bening done a bit differently because we need to get data back
- * to the caller
+ * ceph_mon_generic_request is being used for the statfs and
+ * mon_get_version requests which are being done a bit differently
+ * because we need to get data back to the caller
*/
struct ceph_mon_generic_request {
+ struct ceph_mon_client *monc;
struct kref kref;
u64 tid;
struct rb_node node;
int result;
- void *buf;
- int buf_len;
+
struct completion completion;
+ ceph_monc_callback_t complete_cb;
+ u64 private_data; /* r_tid/linger_id */
+
struct ceph_msg *request; /* original request */
struct ceph_msg *reply; /* and reply */
+
+ union {
+ struct ceph_statfs *st;
+ u64 newest;
+ } u;
};
struct ceph_mon_client {
@@ -69,53 +80,73 @@ struct ceph_mon_client {
bool hunting;
int cur_mon; /* last monitor i contacted */
- unsigned long sub_sent, sub_renew_after;
+ unsigned long sub_renew_after;
+ unsigned long sub_renew_sent;
struct ceph_connection con;
+ bool had_a_connection;
+ int hunt_mult; /* [1..CEPH_MONC_HUNT_MAX_MULT] */
+
/* pending generic requests */
struct rb_root generic_request_tree;
- int num_generic_requests;
u64 last_tid;
- /* mds/osd map */
- int want_mdsmap;
- int want_next_osdmap; /* 1 = want, 2 = want+asked */
- u32 have_osdmap, have_mdsmap;
+ /* subs, indexed with CEPH_SUB_* */
+ struct {
+ struct ceph_mon_subscribe_item item;
+ bool want;
+ u32 have; /* epoch */
+ } subs[4];
+ int fs_cluster_id; /* "mdsmap.<id>" sub */
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_file;
#endif
};
-extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
extern int ceph_monmap_contains(struct ceph_monmap *m,
struct ceph_entity_addr *addr);
extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
extern void ceph_monc_stop(struct ceph_mon_client *monc);
+extern void ceph_monc_reopen_session(struct ceph_mon_client *monc);
+
+enum {
+ CEPH_SUB_MONMAP = 0,
+ CEPH_SUB_OSDMAP,
+ CEPH_SUB_FSMAP,
+ CEPH_SUB_MDSMAP,
+};
+
+extern const char *ceph_sub_str[];
/*
* The model here is to indicate that we need a new map of at least
- * epoch @want, and also call in when we receive a map. We will
+ * epoch @epoch, and also call in when we receive a map. We will
* periodically rerequest the map from the monitor cluster until we
* get what we want.
*/
-extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
-extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
+bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
+ bool continuous);
+void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
+void ceph_monc_renew_subs(struct ceph_mon_client *monc);
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
+extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
+ unsigned long timeout);
-extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
- struct ceph_statfs *buf);
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool,
+ struct ceph_statfs *buf);
-extern int ceph_monc_open_session(struct ceph_mon_client *monc);
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ u64 *newest);
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data);
-extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
+int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
+ struct ceph_entity_addr *client_addr);
-extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
- u32 pool, u64 *snapid);
+extern int ceph_monc_open_session(struct ceph_mon_client *monc);
-extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
- u32 pool, u64 snapid);
+extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
#endif
diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h
index 4b0d38960726..729cdf700eae 100644
--- a/include/linux/ceph/msgpool.h
+++ b/include/linux/ceph/msgpool.h
@@ -1,8 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_MSGPOOL
#define _FS_CEPH_MSGPOOL
#include <linux/mempool.h>
-#include <linux/ceph/messenger.h>
/*
* we use memory pools for preallocating messages we may receive, to
@@ -13,14 +13,15 @@ struct ceph_msgpool {
mempool_t *pool;
int type; /* preallocated message type */
int front_len; /* preallocated payload size */
+ int max_data_items;
};
-extern int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
- int front_len, int size, bool blocking,
- const char *name);
+int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
+ int front_len, int max_data_items, int size,
+ const char *name);
extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
- int front_len);
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len,
+ int max_data_items);
extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
#endif
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 3d94a73b5f30..3989dcb94d3d 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef CEPH_MSGR_H
#define CEPH_MSGR_H
@@ -8,24 +9,45 @@
#define CEPH_MON_PORT 6789 /* default monitor port */
/*
- * client-side processes will try to bind to ports in this
- * range, simply for the benefit of tools like nmap or wireshark
- * that would like to identify the protocol.
- */
-#define CEPH_PORT_FIRST 6789
-#define CEPH_PORT_START 6800 /* non-monitors start here */
-#define CEPH_PORT_LAST 6900
-
-/*
* tcp connection banner. include a protocol version. and adjust
* whenever the wire protocol changes. try to keep this string length
* constant.
*/
#define CEPH_BANNER "ceph v027"
+#define CEPH_BANNER_LEN 9
#define CEPH_BANNER_MAX_LEN 30
/*
+ * messenger V2 connection banner prefix.
+ * The full banner string should have the form: "ceph v2\n<le16>"
+ * the 2 bytes are the length of the remaining banner.
+ */
+#define CEPH_BANNER_V2 "ceph v2\n"
+#define CEPH_BANNER_V2_LEN 8
+#define CEPH_BANNER_V2_PREFIX_LEN (CEPH_BANNER_V2_LEN + sizeof(__le16))
+
+/*
+ * messenger V2 features
+ */
+#define CEPH_MSGR2_INCARNATION_1 (0ull)
+
+#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \
+ static const uint64_t __maybe_unused CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \
+ static const uint64_t __maybe_unused CEPH_MSGR2_FEATUREMASK_##name = \
+ (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation);
+
+#define HAVE_MSGR2_FEATURE(x, name) \
+ (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name))
+
+DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1
+
+#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1)
+
+#define CEPH_MSGR2_REQUIRED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1)
+
+
+/*
* Rollover-safe type and comparator for 32-bit sequence numbers.
* Comparator returns -1, 0, or 1.
*/
@@ -60,11 +82,18 @@ extern const char *ceph_entity_type_name(int type);
* entity_addr -- network address
*/
struct ceph_entity_addr {
- __le32 type;
+ __le32 type; /* CEPH_ENTITY_ADDR_TYPE_* */
__le32 nonce; /* unique id for process (e.g. pid) */
struct sockaddr_storage in_addr;
} __attribute__ ((packed));
+static inline bool ceph_addr_equal_no_type(const struct ceph_entity_addr *lhs,
+ const struct ceph_entity_addr *rhs)
+{
+ return !memcmp(&lhs->in_addr, &rhs->in_addr, sizeof(lhs->in_addr)) &&
+ lhs->nonce == rhs->nonce;
+}
+
struct ceph_entity_inst {
struct ceph_entity_name name;
struct ceph_entity_addr addr;
@@ -84,11 +113,13 @@ struct ceph_entity_inst {
#define CEPH_MSGR_TAG_MSG 7 /* message */
#define CEPH_MSGR_TAG_ACK 8 /* message ack */
#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
+#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */
-
+#define CEPH_MSGR_TAG_KEEPALIVE2 14 /* keepalive2 byte + ceph_timespec */
+#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */
+#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* cephx v2 doing server challenge */
/*
* connection negotiation
@@ -152,10 +183,29 @@ struct ceph_msg_header {
receiver: mask against ~PAGE_MASK */
struct ceph_entity_name src;
- __le32 reserved;
+ __le16 compat_version;
+ __le16 reserved;
__le32 crc; /* header crc32c */
} __attribute__ ((packed));
+struct ceph_msg_header2 {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 data_pre_padding_len;
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ __le64 ack_seq;
+ __u8 flags;
+ /* oldest code we think can decode this. unknown if zero. */
+ __le16 compat_version;
+ __le16 reserved;
+} __attribute__ ((packed));
+
#define CEPH_MSG_PRIO_LOW 64
#define CEPH_MSG_PRIO_DEFAULT 127
#define CEPH_MSG_PRIO_HIGH 196
@@ -164,13 +214,21 @@ struct ceph_msg_header {
/*
* follows data payload
*/
+struct ceph_msg_footer_old {
+ __le32 front_crc, middle_crc, data_crc;
+ __u8 flags;
+} __attribute__ ((packed));
+
struct ceph_msg_footer {
__le32 front_crc, middle_crc, data_crc;
+ // sig holds the 64 bits of the digital signature for the message PLR
+ __le64 sig;
__u8 flags;
} __attribute__ ((packed));
#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
+#define CEPH_MSG_FOOTER_SIGNED (1<<2) /* msg was signed */
#endif
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ce6df39f60ff..50b14a5661c7 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -1,55 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_OSD_CLIENT_H
#define _FS_CEPH_OSD_CLIENT_H
+#include <linux/bitrev.h>
#include <linux/completion.h>
#include <linux/kref.h>
#include <linux/mempool.h>
#include <linux/rbtree.h>
+#include <linux/refcount.h>
+#include <linux/ktime.h>
#include <linux/ceph/types.h>
#include <linux/ceph/osdmap.h>
#include <linux/ceph/messenger.h>
+#include <linux/ceph/msgpool.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/pagelist.h>
-/*
- * Maximum object name size
- * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100)
- */
-#define MAX_OBJ_NAME_SIZE 100
-
struct ceph_msg;
struct ceph_snap_context;
struct ceph_osd_request;
struct ceph_osd_client;
-struct ceph_authorizer;
/*
* completion callback for async writepages
*/
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
- struct ceph_msg *);
-typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
+
+#define CEPH_HOMELESS_OSD -1
+
+/*
+ * A single extent in a SPARSE_READ reply.
+ *
+ * Note that these come from the OSD as little-endian values. On BE arches,
+ * we convert them in-place after receipt.
+ */
+struct ceph_sparse_extent {
+ u64 off;
+ u64 len;
+} __packed;
+
+/* Sparse read state machine state values */
+enum ceph_sparse_read_state {
+ CEPH_SPARSE_READ_HDR = 0,
+ CEPH_SPARSE_READ_EXTENTS,
+ CEPH_SPARSE_READ_DATA_LEN,
+ CEPH_SPARSE_READ_DATA_PRE,
+ CEPH_SPARSE_READ_DATA,
+};
+
+/*
+ * A SPARSE_READ reply is a 32-bit count of extents, followed by an array of
+ * 64-bit offset/length pairs, and then all of the actual file data
+ * concatenated after it (sans holes).
+ *
+ * Unfortunately, we don't know how long the extent array is until we've
+ * started reading the data section of the reply. The caller should send down
+ * a destination buffer for the array, but we'll alloc one if it's too small
+ * or if the caller doesn't.
+ */
+struct ceph_sparse_read {
+ enum ceph_sparse_read_state sr_state; /* state machine state */
+ u64 sr_req_off; /* orig request offset */
+ u64 sr_req_len; /* orig request length */
+ u64 sr_pos; /* current pos in buffer */
+ int sr_index; /* current extent index */
+ u32 sr_datalen; /* length of actual data */
+ u32 sr_count; /* extent count in reply */
+ int sr_ext_len; /* length of extent array */
+ struct ceph_sparse_extent *sr_extent; /* extent array */
+};
-/* a given osd we're communicating with */
+/*
+ * A given osd we're communicating with.
+ *
+ * Note that the o_requests tree can be searched while holding the "lock" mutex
+ * or the "o_requests_lock" spinlock. Insertion or removal requires both!
+ */
struct ceph_osd {
- atomic_t o_ref;
+ refcount_t o_ref;
+ int o_sparse_op_idx;
struct ceph_osd_client *o_osdc;
int o_osd;
int o_incarnation;
struct rb_node o_node;
struct ceph_connection o_con;
- struct list_head o_requests;
- struct list_head o_linger_requests;
+ spinlock_t o_requests_lock;
+ struct rb_root o_requests;
+ struct rb_root o_linger_requests;
+ struct rb_root o_backoff_mappings;
+ struct rb_root o_backoffs_by_id;
struct list_head o_osd_lru;
struct ceph_auth_handshake o_auth;
unsigned long lru_ttl;
- int o_marked_for_keepalive;
struct list_head o_keepalive_item;
+ struct mutex lock;
+ struct ceph_sparse_read o_sparse_read;
};
-
-#define CEPH_OSD_MAX_OP 2
+#define CEPH_OSD_SLAB_OPS 2
+#define CEPH_OSD_MAX_OPS 16
enum ceph_osd_data_type {
CEPH_OSD_DATA_TYPE_NONE = 0,
@@ -58,6 +108,8 @@ enum ceph_osd_data_type {
#ifdef CONFIG_BLOCK
CEPH_OSD_DATA_TYPE_BIO,
#endif /* CONFIG_BLOCK */
+ CEPH_OSD_DATA_TYPE_BVECS,
+ CEPH_OSD_DATA_TYPE_ITER,
};
struct ceph_osd_data {
@@ -73,25 +125,43 @@ struct ceph_osd_data {
struct ceph_pagelist *pagelist;
#ifdef CONFIG_BLOCK
struct {
- struct bio *bio; /* list of bios */
- size_t bio_length; /* total in list */
+ struct ceph_bio_iter bio_pos;
+ u32 bio_length;
};
#endif /* CONFIG_BLOCK */
+ struct {
+ struct ceph_bvec_iter bvec_pos;
+ u32 num_bvecs;
+ };
+ struct iov_iter iter;
};
};
struct ceph_osd_req_op {
u16 op; /* CEPH_OSD_OP_* */
- u32 payload_len;
+ u32 flags; /* CEPH_OSD_OP_FLAG_* */
+ u32 indata_len; /* request */
+ u32 outdata_len; /* reply */
+ s32 rval;
+
union {
struct ceph_osd_data raw_data_in;
struct {
u64 offset, length;
u64 truncate_size;
u32 truncate_seq;
+ int sparse_ext_cnt;
+ struct ceph_sparse_extent *sparse_ext;
struct ceph_osd_data osd_data;
} extent;
struct {
+ u32 name_len;
+ u32 value_len;
+ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
+ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
+ struct ceph_osd_data osd_data;
+ } xattr;
+ struct {
const char *class_name;
const char *method_name;
struct ceph_osd_data request_info;
@@ -99,112 +169,258 @@ struct ceph_osd_req_op {
struct ceph_osd_data response_data;
__u8 class_len;
__u8 method_len;
- __u8 argc;
+ u32 indata_len;
} cls;
struct {
u64 cookie;
- u64 ver;
- u32 prot_ver;
- u32 timeout;
- __u8 flag;
+ __u8 op; /* CEPH_OSD_WATCH_OP_ */
+ u32 gen;
} watch;
+ struct {
+ struct ceph_osd_data request_data;
+ } notify_ack;
+ struct {
+ u64 cookie;
+ struct ceph_osd_data request_data;
+ struct ceph_osd_data response_data;
+ } notify;
+ struct {
+ struct ceph_osd_data response_data;
+ } list_watchers;
+ struct {
+ u64 expected_object_size;
+ u64 expected_write_size;
+ u32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
+ } alloc_hint;
+ struct {
+ u64 snapid;
+ u64 src_version;
+ u8 flags;
+ u32 src_fadvise_flags;
+ struct ceph_osd_data osd_data;
+ } copy_from;
+ struct {
+ u64 ver;
+ } assert_ver;
};
};
+struct ceph_osd_request_target {
+ struct ceph_object_id base_oid;
+ struct ceph_object_locator base_oloc;
+ struct ceph_object_id target_oid;
+ struct ceph_object_locator target_oloc;
+
+ struct ceph_pg pgid; /* last raw pg we mapped to */
+ struct ceph_spg spgid; /* last actual spg we mapped to */
+ u32 pg_num;
+ u32 pg_num_mask;
+ struct ceph_osds acting;
+ struct ceph_osds up;
+ int size;
+ int min_size;
+ bool sort_bitwise;
+ bool recovery_deletes;
+
+ unsigned int flags; /* CEPH_OSD_FLAG_* */
+ bool used_replica;
+ bool paused;
+
+ u32 epoch;
+ u32 last_force_resend;
+
+ int osd;
+};
+
/* an in-flight request */
struct ceph_osd_request {
u64 r_tid; /* unique for this client */
struct rb_node r_node;
- struct list_head r_req_lru_item;
- struct list_head r_osd_item;
- struct list_head r_linger_item;
- struct list_head r_linger_osd;
+ struct rb_node r_mc_node; /* map check */
+ struct work_struct r_complete_work;
struct ceph_osd *r_osd;
- struct ceph_pg r_pgid;
- int r_pg_osds[CEPH_PG_MAX_SIZE];
- int r_num_pg_osds;
+
+ struct ceph_osd_request_target r_t;
+#define r_base_oid r_t.base_oid
+#define r_base_oloc r_t.base_oloc
+#define r_flags r_t.flags
struct ceph_msg *r_request, *r_reply;
- int r_flags; /* any additional flags for the osd */
u32 r_sent; /* >0 if r_request is sending/sent */
/* request osd ops array */
unsigned int r_num_ops;
- struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP];
-
- /* these are updated on each send */
- __le32 *r_request_osdmap_epoch;
- __le32 *r_request_flags;
- __le64 *r_request_pool;
- void *r_request_pgid;
- __le32 *r_request_attempts;
- struct ceph_eversion *r_request_reassert_version;
int r_result;
- int r_reply_op_len[CEPH_OSD_MAX_OP];
- s32 r_reply_op_result[CEPH_OSD_MAX_OP];
- int r_got_reply;
- int r_linger;
struct ceph_osd_client *r_osdc;
struct kref r_kref;
bool r_mempool;
- struct completion r_completion, r_safe_completion;
+ bool r_linger; /* don't resend on failure */
+ struct completion r_completion; /* private to osd_client.c */
ceph_osdc_callback_t r_callback;
- ceph_osdc_unsafe_callback_t r_unsafe_callback;
- struct ceph_eversion r_reassert_version;
- struct list_head r_unsafe_item;
struct inode *r_inode; /* for use by callbacks */
+ struct list_head r_private_item; /* ditto */
void *r_priv; /* ditto */
- char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */
- int r_oid_len;
- u64 r_snapid;
- unsigned long r_stamp; /* send OR check time */
+ /* set by submitter */
+ u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */
+ struct ceph_snap_context *r_snapc; /* for writes */
+ struct timespec64 r_mtime; /* ditto */
+ u64 r_data_offset; /* ditto */
+
+ /* internal */
+ u64 r_version; /* data version sent in reply */
+ unsigned long r_stamp; /* jiffies, send or check time */
+ unsigned long r_start_stamp; /* jiffies */
+ ktime_t r_start_latency; /* ktime_t */
+ ktime_t r_end_latency; /* ktime_t */
+ int r_attempts;
+ u32 r_map_dne_bound;
+
+ struct ceph_osd_req_op r_ops[] __counted_by(r_num_ops);
+};
- struct ceph_file_layout r_file_layout;
- struct ceph_snap_context *r_snapc; /* snap context for writes */
+struct ceph_request_redirect {
+ struct ceph_object_locator oloc;
};
-struct ceph_osd_event {
- u64 cookie;
- int one_shot;
+/*
+ * osd request identifier
+ *
+ * caller name + incarnation# + tid to unique identify this request
+ */
+struct ceph_osd_reqid {
+ struct ceph_entity_name name;
+ __le64 tid;
+ __le32 inc;
+} __packed;
+
+struct ceph_blkin_trace_info {
+ __le64 trace_id;
+ __le64 span_id;
+ __le64 parent_span_id;
+} __packed;
+
+typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
+ u64 notifier_id, void *data, size_t data_len);
+typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
+
+struct ceph_osd_linger_request {
struct ceph_osd_client *osdc;
- void (*cb)(u64, u64, u8, void *);
+ u64 linger_id;
+ bool committed;
+ bool is_watch; /* watch or notify */
+
+ struct ceph_osd *osd;
+ struct ceph_osd_request *reg_req;
+ struct ceph_osd_request *ping_req;
+ unsigned long ping_sent;
+ unsigned long watch_valid_thru;
+ struct list_head pending_lworks;
+
+ struct ceph_osd_request_target t;
+ u32 map_dne_bound;
+
+ struct timespec64 mtime;
+
+ struct kref kref;
+ struct mutex lock;
+ struct rb_node node; /* osd */
+ struct rb_node osdc_node; /* osdc */
+ struct rb_node mc_node; /* map check */
+ struct list_head scan_item;
+
+ struct completion reg_commit_wait;
+ struct completion notify_finish_wait;
+ int reg_commit_error;
+ int notify_finish_error;
+ int last_error;
+
+ u32 register_gen;
+ u64 notify_id;
+
+ rados_watchcb2_t wcb;
+ rados_watcherrcb_t errcb;
void *data;
+
+ struct ceph_pagelist *request_pl;
+ struct page **notify_id_pages;
+
+ struct page ***preply_pages;
+ size_t *preply_len;
+};
+
+struct ceph_watch_item {
+ struct ceph_entity_name name;
+ u64 cookie;
+ struct ceph_entity_addr addr;
+};
+
+struct ceph_spg_mapping {
struct rb_node node;
- struct list_head osd_node;
- struct kref kref;
+ struct ceph_spg spgid;
+
+ struct rb_root backoffs;
};
-struct ceph_osd_event_work {
- struct work_struct work;
- struct ceph_osd_event *event;
- u64 ver;
- u64 notify_id;
- u8 opcode;
+struct ceph_hobject_id {
+ void *key;
+ size_t key_len;
+ void *oid;
+ size_t oid_len;
+ u64 snapid;
+ u32 hash;
+ u8 is_max;
+ void *nspace;
+ size_t nspace_len;
+ s64 pool;
+
+ /* cache */
+ u32 hash_reverse_bits;
};
+static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid)
+{
+ hoid->hash_reverse_bits = bitrev32(hoid->hash);
+}
+
+/*
+ * PG-wide backoff: [begin, end)
+ * per-object backoff: begin == end
+ */
+struct ceph_osd_backoff {
+ struct rb_node spg_node;
+ struct rb_node id_node;
+
+ struct ceph_spg spgid;
+ u64 id;
+ struct ceph_hobject_id *begin;
+ struct ceph_hobject_id *end;
+};
+
+#define CEPH_LINGER_ID_START 0xffff000000000000ULL
+
struct ceph_osd_client {
struct ceph_client *client;
struct ceph_osdmap *osdmap; /* current map */
- struct rw_semaphore map_sem;
- struct completion map_waiters;
- u64 last_requested_map;
+ struct rw_semaphore lock;
- struct mutex request_mutex;
struct rb_root osds; /* osds */
struct list_head osd_lru; /* idle osds */
- u64 timeout_tid; /* tid of timeout triggering rq */
- u64 last_tid; /* tid of last request */
- struct rb_root requests; /* pending requests */
- struct list_head req_lru; /* in-flight lru */
- struct list_head req_unsent; /* unsent/need-resend queue */
- struct list_head req_notarget; /* map to no osd */
- struct list_head req_linger; /* lingering requests */
- int num_requests;
+ spinlock_t osd_lru_lock;
+ u32 epoch_barrier;
+ struct ceph_osd homeless_osd;
+ atomic64_t last_tid; /* tid of last request */
+ u64 last_linger_id;
+ struct rb_root linger_requests; /* lingering requests */
+ struct rb_root map_checks;
+ struct rb_root linger_map_checks;
+ atomic_t num_requests;
+ atomic_t num_homeless;
+ int abort_err;
struct delayed_work timeout_work;
struct delayed_work osds_timeout_work;
#ifdef CONFIG_DEBUG_FS
@@ -216,27 +432,39 @@ struct ceph_osd_client {
struct ceph_msgpool msgpool_op;
struct ceph_msgpool msgpool_op_reply;
- spinlock_t event_lock;
- struct rb_root event_tree;
- u64 event_count;
-
struct workqueue_struct *notify_wq;
+ struct workqueue_struct *completion_wq;
};
+static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag)
+{
+ return osdc->osdmap->flags & flag;
+}
+
extern int ceph_osdc_setup(void);
extern void ceph_osdc_cleanup(void);
extern int ceph_osdc_init(struct ceph_osd_client *osdc,
struct ceph_client *client);
extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
-extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
- struct ceph_msg *msg);
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
+void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
+void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
+void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc);
-extern void osd_req_op_init(struct ceph_osd_request *osd_req,
- unsigned int which, u16 opcode);
+#define osd_req_op_data(oreq, whch, typ, fld) \
+({ \
+ struct ceph_osd_request *__oreq = (oreq); \
+ unsigned int __whch = (whch); \
+ BUG_ON(__whch >= __oreq->r_num_ops); \
+ &__oreq->r_ops[__whch].typ.fld; \
+})
+
+struct ceph_osd_req_op *osd_req_op_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode, u32 flags);
extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
unsigned int which,
@@ -250,113 +478,166 @@ extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
u64 truncate_size, u32 truncate_seq);
extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
unsigned int which, u64 length);
+extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
+ unsigned int which, u64 offset_inc);
extern struct ceph_osd_data *osd_req_op_extent_osd_data(
struct ceph_osd_request *osd_req,
unsigned int which);
-extern struct ceph_osd_data *osd_req_op_cls_response_data(
- struct ceph_osd_request *osd_req,
- unsigned int which);
extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
unsigned int which,
struct page **pages, u64 length,
u32 alignment, bool pages_from_pool,
bool own_pages);
-extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *,
- unsigned int which,
- struct ceph_pagelist *pagelist);
#ifdef CONFIG_BLOCK
-extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *,
- unsigned int which,
- struct bio *bio, size_t bio_length);
+void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct ceph_bio_iter *bio_pos,
+ u32 bio_length);
#endif /* CONFIG_BLOCK */
+void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct bio_vec *bvecs, u32 num_bvecs,
+ u32 bytes);
+void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct ceph_bvec_iter *bvec_pos);
+void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
+ unsigned int which, struct iov_iter *iter);
-extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
- unsigned int which,
- struct ceph_pagelist *pagelist);
extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *,
unsigned int which,
struct page **pages, u64 length,
u32 alignment, bool pages_from_pool,
bool own_pages);
+void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct bio_vec *bvecs, u32 num_bvecs,
+ u32 bytes);
extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
unsigned int which,
struct page **pages, u64 length,
u32 alignment, bool pages_from_pool,
bool own_pages);
-
-extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
- unsigned int which, u16 opcode,
- const char *class, const char *method);
-extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
- unsigned int which, u16 opcode,
- u64 cookie, u64 version, int flag);
+int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
+ const char *class, const char *method);
+extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
+ u16 opcode, const char *name, const void *value,
+ size_t size, u8 cmp_op, u8 cmp_mode);
+extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ u64 expected_object_size,
+ u64 expected_write_size,
+ u32 flags);
+extern int osd_req_op_copy_from_init(struct ceph_osd_request *req,
+ u64 src_snapid, u64 src_version,
+ struct ceph_object_id *src_oid,
+ struct ceph_object_locator *src_oloc,
+ u32 src_fadvise_flags,
+ u32 dst_fadvise_flags,
+ u32 truncate_seq, u64 truncate_size,
+ u8 copy_from_flags);
extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
struct ceph_snap_context *snapc,
unsigned int num_ops,
bool use_mempool,
gfp_t gfp_flags);
-
-extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
- struct ceph_snap_context *snapc,
- u64 snap_id,
- struct timespec *mtime);
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
struct ceph_file_layout *layout,
struct ceph_vino vino,
u64 offset, u64 *len,
- int num_ops, int opcode, int flags,
+ unsigned int which, int num_ops,
+ int opcode, int flags,
struct ceph_snap_context *snapc,
u32 truncate_seq, u64 truncate_size,
bool use_mempool);
-extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
+int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt);
-static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
-{
- kref_get(&req->r_kref);
-}
-extern void ceph_osdc_release_request(struct kref *kref);
-static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
+/*
+ * How big an extent array should we preallocate for a sparse read? This is
+ * just a starting value. If we get more than this back from the OSD, the
+ * receiver will reallocate.
+ */
+#define CEPH_SPARSE_EXT_ARRAY_INITIAL 16
+
+static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt)
{
- kref_put(&req->r_kref, ceph_osdc_release_request);
+ if (!cnt)
+ cnt = CEPH_SPARSE_EXT_ARRAY_INITIAL;
+
+ return __ceph_alloc_sparse_ext_map(op, cnt);
}
-extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail);
+extern void ceph_osdc_get_request(struct ceph_osd_request *req);
+extern void ceph_osdc_put_request(struct ceph_osd_request *req);
+
+void ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req);
extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
-extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int nr_pages,
- int page_align);
-
-extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *sc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec *mtime,
- struct page **pages, int nr_pages);
-
-/* watch/notify events */
-extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
- void (*event_cb)(u64, u64, u8, void *),
- void *data, struct ceph_osd_event **pevent);
-extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
-extern void ceph_osdc_put_event(struct ceph_osd_event *event);
-#endif
+extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
+
+int ceph_osdc_call(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ const char *class, const char *method,
+ unsigned int flags,
+ struct page *req_page, size_t req_len,
+ struct page **resp_pages, size_t *resp_len);
+
+/* watch/notify */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ rados_watchcb2_t wcb,
+ rados_watcherrcb_t errcb,
+ void *data);
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq);
+
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ u64 notify_id,
+ u64 cookie,
+ void *payload,
+ u32 payload_len);
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ void *payload,
+ u32 payload_len,
+ u32 timeout,
+ struct page ***preply_pages,
+ size_t *preply_len);
+int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ struct ceph_watch_item **watchers,
+ u32 *num_watchers);
+
+/* Find offset into the buffer of the end of the extent map */
+static inline u64 ceph_sparse_ext_map_end(struct ceph_osd_req_op *op)
+{
+ struct ceph_sparse_extent *ext;
+ /* No extents? No data */
+ if (op->extent.sparse_ext_cnt == 0)
+ return 0;
+
+ ext = &op->extent.sparse_ext[op->extent.sparse_ext_cnt - 1];
+
+ return ext->off + ext->len - op->extent.offset;
+}
+
+#endif
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index d05cc4451af6..5553019c3f07 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -1,10 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_OSDMAP_H
#define _FS_CEPH_OSDMAP_H
#include <linux/rbtree.h>
#include <linux/ceph/types.h>
#include <linux/ceph/decode.h>
-#include <linux/ceph/ceph_fs.h>
#include <linux/crush/crush.h>
/*
@@ -24,93 +24,199 @@ struct ceph_pg {
uint32_t seed;
};
-#define CEPH_POOL_FLAG_HASHPSPOOL 1
+#define CEPH_SPG_NOSHARD -1
+
+struct ceph_spg {
+ struct ceph_pg pgid;
+ s8 shard;
+};
+
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
+int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
+
+#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
+ together */
+#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */
+#define CEPH_POOL_FLAG_FULL_QUOTA (1ULL << 10) /* pool ran out of quota,
+ will set FULL too */
+#define CEPH_POOL_FLAG_NEARFULL (1ULL << 11) /* pool is nearfull */
struct ceph_pg_pool_info {
struct rb_node node;
s64 id;
- u8 type;
+ u8 type; /* CEPH_POOL_TYPE_* */
u8 size;
+ u8 min_size;
u8 crush_ruleset;
u8 object_hash;
+ u32 last_force_request_resend;
u32 pg_num, pgp_num;
int pg_num_mask, pgp_num_mask;
- u64 flags;
+ s64 read_tier;
+ s64 write_tier; /* wins for read+write ops */
+ u64 flags; /* CEPH_POOL_FLAG_* */
char *name;
+
+ bool was_full; /* for handle_one_map() */
};
+static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
+{
+ switch (pool->type) {
+ case CEPH_POOL_TYPE_REP:
+ return true;
+ case CEPH_POOL_TYPE_EC:
+ return false;
+ default:
+ BUG();
+ }
+}
+
struct ceph_object_locator {
- uint64_t pool;
- char *key;
+ s64 pool;
+ struct ceph_string *pool_ns;
+};
+
+static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
+{
+ oloc->pool = -1;
+ oloc->pool_ns = NULL;
+}
+
+static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
+{
+ return oloc->pool == -1;
+}
+
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+ const struct ceph_object_locator *src);
+void ceph_oloc_destroy(struct ceph_object_locator *oloc);
+
+/*
+ * 51-char inline_name is long enough for all cephfs and all but one
+ * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
+ * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all
+ * other rbd requests fit into inline_name.
+ *
+ * Makes ceph_object_id 64 bytes on 64-bit.
+ */
+#define CEPH_OID_INLINE_LEN 52
+
+/*
+ * Both inline and external buffers have space for a NUL-terminator,
+ * which is carried around. It's not required though - RADOS object
+ * names don't have to be NUL-terminated and may contain NULs.
+ */
+struct ceph_object_id {
+ char *name;
+ char inline_name[CEPH_OID_INLINE_LEN];
+ int name_len;
+};
+
+#define __CEPH_OID_INITIALIZER(oid) { .name = (oid).inline_name }
+
+#define CEPH_DEFINE_OID_ONSTACK(oid) \
+ struct ceph_object_id oid = __CEPH_OID_INITIALIZER(oid)
+
+static inline void ceph_oid_init(struct ceph_object_id *oid)
+{
+ *oid = (struct ceph_object_id) __CEPH_OID_INITIALIZER(*oid);
+}
+
+static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
+{
+ return oid->name == oid->inline_name && !oid->name_len;
+}
+
+void ceph_oid_copy(struct ceph_object_id *dest,
+ const struct ceph_object_id *src);
+__printf(2, 3)
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
+__printf(3, 4)
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, ...);
+void ceph_oid_destroy(struct ceph_object_id *oid);
+
+struct workspace_manager {
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ /* Number of free workspaces */
+ int free_ws;
+ /* Total number of allocated workspaces */
+ atomic_t total_ws;
+ /* Waiters for a free workspace */
+ wait_queue_head_t ws_wait;
};
struct ceph_pg_mapping {
struct rb_node node;
struct ceph_pg pgid;
- int len;
- int osds[];
+
+ union {
+ struct {
+ int len;
+ int osds[];
+ } pg_temp, pg_upmap;
+ struct {
+ int osd;
+ } primary_temp;
+ struct {
+ int len;
+ int from_to[][2];
+ } pg_upmap_items;
+ };
};
struct ceph_osdmap {
struct ceph_fsid fsid;
u32 epoch;
- u32 mkfs_epoch;
struct ceph_timespec created, modified;
u32 flags; /* CEPH_OSDMAP_* */
u32 max_osd; /* size of osd_state, _offload, _addr arrays */
- u8 *osd_state; /* CEPH_OSD_* */
+ u32 *osd_state; /* CEPH_OSD_* */
u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
struct ceph_entity_addr *osd_addr;
struct rb_root pg_temp;
+ struct rb_root primary_temp;
+
+ /* remap (post-CRUSH, pre-up) */
+ struct rb_root pg_upmap; /* PG := raw set */
+ struct rb_root pg_upmap_items; /* from -> to within raw set */
+
+ u32 *osd_primary_affinity;
+
struct rb_root pg_pools;
u32 pool_max;
/* the CRUSH map specifies the mapping of placement groups to
* the list of osds that store+replicate them. */
struct crush_map *crush;
-};
-/*
- * file layout helpers
- */
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
- ((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
- ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_pool(l) \
- ((__s32)le32_to_cpu((l).fl_pg_pool))
-
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
- return le32_to_cpu(l->fl_stripe_unit) *
- le32_to_cpu(l->fl_stripe_count);
-}
+ struct workspace_manager crush_wsm;
+};
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
+static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
{
- return le32_to_cpu(l->fl_object_size) *
- le32_to_cpu(l->fl_stripe_count);
+ return osd >= 0 && osd < map->max_osd &&
+ (map->osd_state[osd] & CEPH_OSD_EXISTS);
}
-
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
{
- return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
+ return ceph_osd_exists(map, osd) &&
+ (map->osd_state[osd] & CEPH_OSD_UP);
}
-static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
+static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
{
- return map && (map->flags & flag);
+ return !ceph_osd_is_up(map, osd);
}
-extern char *ceph_osdmap_state_str(char *str, int len, int state);
+char *ceph_osdmap_state_str(char *str, int len, u32 state);
+extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
int osd)
@@ -120,18 +226,19 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
return &map->osd_addr[osd];
}
+#define CEPH_PGID_ENCODING_LEN (1 + 8 + 4 + 4)
+
static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
{
__u8 version;
- if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) {
- pr_warning("incomplete pg encoding");
-
+ if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
+ pr_warn("incomplete pg encoding\n");
return -EINVAL;
}
version = ceph_decode_8(p);
if (version > 1) {
- pr_warning("do not understand pg encoding %d > 1",
+ pr_warn("do not understand pg encoding %d > 1\n",
(int)version);
return -EINVAL;
}
@@ -143,27 +250,90 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
return 0;
}
-extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
- struct ceph_osdmap *map,
- struct ceph_messenger *msgr);
+struct ceph_osdmap *ceph_osdmap_alloc(void);
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2);
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
+ struct ceph_osdmap *map);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
-/* calculate mapping of a file extent to an object */
-extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
- u64 off, u64 len,
- u64 *bno, u64 *oxoff, u64 *oxlen);
+struct ceph_osds {
+ int osds[CEPH_PG_MAX_SIZE];
+ int size;
+ int primary; /* id, NOT index */
+};
+
+static inline void ceph_osds_init(struct ceph_osds *set)
+{
+ set->size = 0;
+ set->primary = -1;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
+
+bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
+ u32 new_pg_num);
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ const struct ceph_osds *old_up,
+ const struct ceph_osds *new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ u32 old_pg_num,
+ u32 new_pg_num,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
+ const struct ceph_pg *pgid);
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ bool any_change);
+
+void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
+ const struct ceph_object_id *oid,
+ const struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid);
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+ const struct ceph_object_id *oid,
+ const struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid);
+
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *up,
+ struct ceph_osds *acting);
+bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_spg *spgid);
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid);
+
+struct crush_loc {
+ char *cl_type_name;
+ char *cl_name;
+};
+
+struct crush_loc_node {
+ struct rb_node cl_node;
+ struct crush_loc cl_loc; /* pointers into cl_data */
+ char cl_data[];
+};
+
+int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
+int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
+void ceph_clear_crush_locs(struct rb_root *locs);
-/* calculate mapping of object to a placement group */
-extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid,
- struct ceph_osdmap *osdmap, uint64_t pool);
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
- struct ceph_pg pgid,
- int *acting);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
- struct ceph_pg pgid);
+int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
+ struct rb_root *locs);
+extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
+ u64 id);
extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);
#endif
diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h
index 9660d6b0a35d..879bec0863aa 100644
--- a/include/linux/ceph/pagelist.h
+++ b/include/linux/ceph/pagelist.h
@@ -1,7 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FS_CEPH_PAGELIST_H
#define __FS_CEPH_PAGELIST_H
+#include <asm/byteorder.h>
+#include <linux/refcount.h>
#include <linux/list.h>
+#include <linux/types.h>
struct ceph_pagelist {
struct list_head head;
@@ -10,25 +14,12 @@ struct ceph_pagelist {
size_t room;
struct list_head free_list;
size_t num_pages_free;
+ refcount_t refcnt;
};
-struct ceph_pagelist_cursor {
- struct ceph_pagelist *pl; /* pagelist, for error checking */
- struct list_head *page_lru; /* page in list */
- size_t room; /* room remaining to reset to */
-};
-
-static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
-{
- INIT_LIST_HEAD(&pl->head);
- pl->mapped_tail = NULL;
- pl->length = 0;
- pl->room = 0;
- INIT_LIST_HEAD(&pl->free_list);
- pl->num_pages_free = 0;
-}
+struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags);
-extern int ceph_pagelist_release(struct ceph_pagelist *pl);
+extern void ceph_pagelist_release(struct ceph_pagelist *pl);
extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
@@ -36,12 +27,6 @@ extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space);
extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl);
-extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
- struct ceph_pagelist_cursor *c);
-
-extern int ceph_pagelist_truncate(struct ceph_pagelist *pl,
- struct ceph_pagelist_cursor *c);
-
static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
{
__le64 ev = cpu_to_le64(v);
@@ -62,7 +47,7 @@ static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
return ceph_pagelist_append(pl, &v, 1);
}
static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
- char *s, size_t len)
+ char *s, u32 len)
{
int ret = ceph_pagelist_encode_32(pl, len);
if (ret)
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 68c96a508ac2..73c3efbec36c 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef CEPH_RADOS_H
#define CEPH_RADOS_H
@@ -50,7 +51,7 @@ struct ceph_timespec {
#define CEPH_PG_LAYOUT_LINEAR 2
#define CEPH_PG_LAYOUT_HYBRID 3
-#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
+#define CEPH_PG_MAX_SIZE 32 /* max # osds in a single pg */
/*
* placement group.
@@ -81,8 +82,9 @@ struct ceph_pg_v1 {
*/
#define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */
-#define CEPH_PG_TYPE_REP 1
-#define CEPH_PG_TYPE_RAID4 2
+#define CEPH_POOL_TYPE_REP 1
+#define CEPH_POOL_TYPE_RAID4 2 /* never implemented */
+#define CEPH_POOL_TYPE_EC 3
/*
* stable_mod func is used to control number of placement groups.
@@ -113,8 +115,8 @@ struct ceph_object_layout {
* compound epoch+version, used by storage layer to serialize mutations
*/
struct ceph_eversion {
- __le32 epoch;
__le64 version;
+ __le32 epoch;
} __attribute__ ((packed));
/*
@@ -133,12 +135,18 @@ extern const char *ceph_osd_state_name(int s);
#define CEPH_OSD_IN 0x10000
#define CEPH_OSD_OUT 0
+/* osd primary-affinity. fixed point value: 0x10000 == baseline */
+#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
+#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
+
/*
* osd map flag bits
*/
-#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
-#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC),
+ not set since ~luminous */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC),
+ not set since ~luminous */
#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
@@ -148,6 +156,15 @@ extern const char *ceph_osd_state_name(int s);
#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
+#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */
+#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
+#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
/*
* The error code to return when an OSD can't handle a write
@@ -167,6 +184,7 @@ extern const char *ceph_osd_state_name(int s);
#define CEPH_OSD_OP_MODE_WR 0x2000
#define CEPH_OSD_OP_MODE_RMW 0x3000
#define CEPH_OSD_OP_MODE_SUB 0x4000
+#define CEPH_OSD_OP_MODE_CACHE 0x8000
#define CEPH_OSD_OP_TYPE 0x0f00
#define CEPH_OSD_OP_TYPE_LOCK 0x0100
@@ -176,100 +194,135 @@ extern const char *ceph_osd_state_name(int s);
#define CEPH_OSD_OP_TYPE_PG 0x0500
#define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */
+#define __CEPH_OSD_OP1(mode, nr) \
+ (CEPH_OSD_OP_MODE_##mode | (nr))
+
+#define __CEPH_OSD_OP(mode, type, nr) \
+ (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr))
+
+#define __CEPH_FORALL_OSD_OPS(f) \
+ /** data **/ \
+ /* read */ \
+ f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \
+ f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \
+ f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \
+ \
+ /* fancy read */ \
+ f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \
+ f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \
+ \
+ f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \
+ f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \
+ \
+ /* versioning */ \
+ f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \
+ \
+ f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \
+ \
+ f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \
+ \
+ /* sync */ \
+ f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \
+ \
+ /* write */ \
+ f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \
+ f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \
+ f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \
+ f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \
+ f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \
+ \
+ /* fancy write */ \
+ f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \
+ f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \
+ f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \
+ \
+ f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \
+ f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \
+ f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \
+ \
+ f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \
+ f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \
+ \
+ f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \
+ \
+ /* omap */ \
+ f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \
+ f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \
+ f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \
+ f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \
+ f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \
+ f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \
+ f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \
+ f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \
+ f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \
+ \
+ /* tiering */ \
+ f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \
+ f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \
+ f(COPY_GET_CLASSIC, __CEPH_OSD_OP(RD, DATA, 27), "copy-get-classic") \
+ f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \
+ f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \
+ f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \
+ f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \
+ f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \
+ f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \
+ \
+ /* convert tmap to omap */ \
+ f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \
+ \
+ /* hints */ \
+ f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \
+ \
+ /** multi **/ \
+ f(CLONERANGE, __CEPH_OSD_OP(WR, MULTI, 1), "clonerange") \
+ f(ASSERT_SRC_VERSION, __CEPH_OSD_OP(RD, MULTI, 2), "assert-src-version") \
+ f(SRC_CMPXATTR, __CEPH_OSD_OP(RD, MULTI, 3), "src-cmpxattr") \
+ \
+ /** attrs **/ \
+ /* read */ \
+ f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \
+ f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \
+ f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \
+ \
+ /* write */ \
+ f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \
+ f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \
+ f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \
+ f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \
+ \
+ /** subop **/ \
+ f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \
+ f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \
+ f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \
+ f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \
+ f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \
+ f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \
+ f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \
+ f(SCRUB_STOP, __CEPH_OSD_OP1(SUB, 8), "scrub-stop") \
+ f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \
+ \
+ /** lock **/ \
+ f(WRLOCK, __CEPH_OSD_OP(WR, LOCK, 1), "wrlock") \
+ f(WRUNLOCK, __CEPH_OSD_OP(WR, LOCK, 2), "wrunlock") \
+ f(RDLOCK, __CEPH_OSD_OP(WR, LOCK, 3), "rdlock") \
+ f(RDUNLOCK, __CEPH_OSD_OP(WR, LOCK, 4), "rdunlock") \
+ f(UPLOCK, __CEPH_OSD_OP(WR, LOCK, 5), "uplock") \
+ f(DNLOCK, __CEPH_OSD_OP(WR, LOCK, 6), "dnlock") \
+ \
+ /** exec **/ \
+ /* note: the RD bit here is wrong; see special-case below in helper */ \
+ f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \
+ \
+ /** pg **/ \
+ f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \
+ f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \
+ f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \
+ f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get")
+
enum {
- /** data **/
- /* read */
- CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
- CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
- CEPH_OSD_OP_MAPEXT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3,
-
- /* fancy read */
- CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
- CEPH_OSD_OP_SPARSE_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 5,
-
- CEPH_OSD_OP_NOTIFY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 6,
- CEPH_OSD_OP_NOTIFY_ACK = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 7,
-
- /* versioning */
- CEPH_OSD_OP_ASSERT_VER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 8,
-
- /* write */
- CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
- CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
- CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
- CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
- CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
-
- /* fancy write */
- CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
- CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
- CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
- CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
-
- CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
- CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
- CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
-
- CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
- CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
-
- CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15,
-
- /* omap */
- CEPH_OSD_OP_OMAPGETKEYS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 17,
- CEPH_OSD_OP_OMAPGETVALS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 18,
- CEPH_OSD_OP_OMAPGETHEADER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 19,
- CEPH_OSD_OP_OMAPGETVALSBYKEYS =
- CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 20,
- CEPH_OSD_OP_OMAPSETVALS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 21,
- CEPH_OSD_OP_OMAPSETHEADER = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 22,
- CEPH_OSD_OP_OMAPCLEAR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 23,
- CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
- CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
-
- /** multi **/
- CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
- CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
- CEPH_OSD_OP_SRC_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 3,
-
- /** attrs **/
- /* read */
- CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
- CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
- CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
-
- /* write */
- CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
- CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
- CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
- CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
-
- /** subop **/
- CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
- CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
- CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
- CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
- CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
- CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6,
- CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7,
- CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8,
- CEPH_OSD_OP_SCRUB_MAP = CEPH_OSD_OP_MODE_SUB | 9,
-
- /** lock **/
- CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
- CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
- CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
- CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
- CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
- CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
-
- /** exec **/
- /* note: the RD bit here is wrong; see special-case below in helper */
- CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
-
- /** pg **/
- CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
- CEPH_OSD_OP_PGLS_FILTER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 2,
+#define GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode),
+__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY)
+#undef GENERATE_ENUM_ENTRY
};
static inline int ceph_osd_op_type_lock(int op)
@@ -344,15 +397,34 @@ enum {
CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
+ CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */
+ CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
+ CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
+ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */
+ CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if
+ pool uses pool snaps */
+ CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
+ CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
+ CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
+ CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
};
enum {
CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */
+ CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in
+ the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed
+ in the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40,/* data will be accessed only
+ once by this client */
};
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
-#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+#define EBLOCKLISTED ESHUTDOWN /* blocklisted */
/* xattr comparison */
enum {
@@ -370,7 +442,46 @@ enum {
CEPH_OSD_CMPXATTR_MODE_U64 = 2
};
-#define RADOS_NOTIFY_VER 1
+enum {
+ CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
+ * cloneid */
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* send truncate_{seq,size} */
+};
+
+enum {
+ CEPH_OSD_WATCH_OP_UNWATCH = 0,
+ CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+ /* note: use only ODD ids to prevent pre-giant code from
+ interpreting the op as UNWATCH */
+ CEPH_OSD_WATCH_OP_WATCH = 3,
+ CEPH_OSD_WATCH_OP_RECONNECT = 5,
+ CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+const char *ceph_osd_watch_op_name(int o);
+
+enum {
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+
+enum {
+ CEPH_OSD_BACKOFF_OP_BLOCK = 1,
+ CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
+ CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
+};
/*
* an individual object operation. each may be accompanied by some data
@@ -378,7 +489,7 @@ enum {
*/
struct ceph_osd_op {
__le16 op; /* CEPH_OSD_OP_* */
- __le32 flags; /* CEPH_OSD_FLAG_* */
+ __le32 flags; /* CEPH_OSD_OP_FLAG_* */
union {
struct {
__le64 offset, length;
@@ -405,13 +516,37 @@ struct ceph_osd_op {
} __attribute__ ((packed)) snap;
struct {
__le64 cookie;
- __le64 ver;
- __u8 flag; /* 0 = unwatch, 1 = watch */
+ __le64 ver; /* no longer used */
+ __u8 op; /* CEPH_OSD_WATCH_OP_* */
+ __le32 gen; /* registration generation */
} __attribute__ ((packed)) watch;
struct {
+ __le64 cookie;
+ } __attribute__ ((packed)) notify;
+ struct {
+ __le64 unused;
+ __le64 ver;
+ } __attribute__ ((packed)) assert_ver;
+ struct {
__le64 offset, length;
__le64 src_offset;
} __attribute__ ((packed)) clonerange;
+ struct {
+ __le64 expected_object_size;
+ __le64 expected_write_size;
+ __le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
+ } __attribute__ ((packed)) alloc_hint;
+ struct {
+ __le64 snapid;
+ __le64 src_version;
+ __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */
+ /*
+ * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags
+ * for src object, flags for dest object are in
+ * ceph_osd_op::flags.
+ */
+ __le32 src_fadvise_flags;
+ } __attribute__ ((packed)) copy_from;
};
__le32 payload_len;
} __attribute__ ((packed));
diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h
new file mode 100644
index 000000000000..a4a9962d1e14
--- /dev/null
+++ b/include/linux/ceph/string_table.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_STRING_TABLE_H
+#define _FS_CEPH_STRING_TABLE_H
+
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+
+struct ceph_string {
+ struct kref kref;
+ union {
+ struct rb_node node;
+ struct rcu_head rcu;
+ };
+ size_t len;
+ char str[];
+};
+
+extern void ceph_release_string(struct kref *ref);
+extern struct ceph_string *ceph_find_or_create_string(const char *str,
+ size_t len);
+extern bool ceph_strings_empty(void);
+
+static inline struct ceph_string *ceph_get_string(struct ceph_string *str)
+{
+ kref_get(&str->kref);
+ return str;
+}
+
+static inline void ceph_put_string(struct ceph_string *str)
+{
+ if (!str)
+ return;
+ kref_put(&str->kref, ceph_release_string);
+}
+
+static inline int ceph_compare_string(struct ceph_string *cs,
+ const char* str, size_t len)
+{
+ size_t cs_len = cs ? cs->len : 0;
+ if (cs_len != len)
+ return cs_len - len;
+ if (len == 0)
+ return 0;
+ return strncmp(cs->str, str, len);
+}
+
+#define ceph_try_get_string(x) \
+({ \
+ struct ceph_string *___str; \
+ rcu_read_lock(); \
+ for (;;) { \
+ ___str = rcu_dereference(x); \
+ if (!___str || \
+ kref_get_unless_zero(&___str->kref)) \
+ break; \
+ } \
+ rcu_read_unlock(); \
+ (___str); \
+})
+
+#endif
diff --git a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h
new file mode 100644
index 000000000000..3486636c0e6e
--- /dev/null
+++ b/include/linux/ceph/striper.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CEPH_STRIPER_H
+#define _LINUX_CEPH_STRIPER_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+
+struct ceph_file_layout;
+
+void ceph_calc_file_object_mapping(struct ceph_file_layout *l,
+ u64 off, u64 len,
+ u64 *objno, u64 *objoff, u32 *xlen);
+
+struct ceph_object_extent {
+ struct list_head oe_item;
+ u64 oe_objno;
+ u64 oe_off;
+ u64 oe_len;
+};
+
+static inline void ceph_object_extent_init(struct ceph_object_extent *ex)
+{
+ INIT_LIST_HEAD(&ex->oe_item);
+}
+
+/*
+ * Called for each mapped stripe unit.
+ *
+ * @bytes: number of bytes mapped, i.e. the minimum of the full length
+ * requested (file extent length) or the remainder of the stripe
+ * unit within an object
+ */
+typedef void (*ceph_object_extent_fn_t)(struct ceph_object_extent *ex,
+ u32 bytes, void *arg);
+
+int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len,
+ struct list_head *object_extents,
+ struct ceph_object_extent *alloc_fn(void *arg),
+ void *alloc_arg,
+ ceph_object_extent_fn_t action_fn,
+ void *action_arg);
+int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len,
+ struct list_head *object_extents,
+ ceph_object_extent_fn_t action_fn,
+ void *action_arg);
+
+struct ceph_file_extent {
+ u64 fe_off;
+ u64 fe_len;
+};
+
+static inline u64 ceph_file_extents_bytes(struct ceph_file_extent *file_extents,
+ u32 num_file_extents)
+{
+ u64 bytes = 0;
+ u32 i;
+
+ for (i = 0; i < num_file_extents; i++)
+ bytes += file_extents[i].fe_len;
+
+ return bytes;
+}
+
+int ceph_extent_to_file(struct ceph_file_layout *l,
+ u64 objno, u64 objoff, u64 objlen,
+ struct ceph_file_extent **file_extents,
+ u32 *num_file_extents);
+
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size);
+
+#endif
diff --git a/include/linux/ceph/types.h b/include/linux/ceph/types.h
index d3ff1cf2d27e..bd3d532902d7 100644
--- a/include/linux/ceph/types.h
+++ b/include/linux/ceph/types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_TYPES_H
#define _FS_CEPH_TYPES_H
@@ -23,6 +24,7 @@ struct ceph_vino {
/* context for the caps reservation mechanism */
struct ceph_cap_reservation {
int count;
+ int used;
};