Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755583AbYKNB1a (ORCPT ); Thu, 13 Nov 2008 20:27:30 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1756443AbYKNBWz (ORCPT ); Thu, 13 Nov 2008 20:22:55 -0500 Received: from cobra.newdream.net ([66.33.216.30]:38159 "EHLO cobra.newdream.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756147AbYKNBVh (ORCPT ); Thu, 13 Nov 2008 20:21:37 -0500 From: Sage Weil To: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org, Sage Weil Subject: [PATCH 02/19] ceph: on-wire types Date: Thu, 13 Nov 2008 16:56:01 -0800 Message-Id: <1226624178-3761-3-git-send-email-sage@newdream.net> X-Mailer: git-send-email 1.5.6.5 In-Reply-To: <1226624178-3761-2-git-send-email-sage@newdream.net> References: <1226624178-3761-1-git-send-email-sage@newdream.net> <1226624178-3761-2-git-send-email-sage@newdream.net> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 36407 Lines: 1253 This header describes the types used to exchange messages between the Ceph client and various servers. All types are little-endian and packed. Additionally, we define a few magic values to identify the current version of the protocol(s) in use, so that discrepancies can be detected on mount. Signed-off-by: Sage Weil --- fs/ceph/ceph_fs.h | 1225 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 1225 insertions(+), 0 deletions(-) create mode 100644 fs/ceph/ceph_fs.h diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h new file mode 100644 index 0000000..16b9d50 --- /dev/null +++ b/fs/ceph/ceph_fs.h @@ -0,0 +1,1225 @@ +/* + * ceph_fs.h - Ceph constants and data types to share between kernel and + * user space. + * + * LGPL2 + */ + +#ifndef _FS_CEPH_CEPH_FS_H +#define _FS_CEPH_CEPH_FS_H + + +#define CEPH_MON_PORT 12345 + +/* + * Max file size is a policy choice; in reality we are limited + * by 2^64. 
+ */ +#define CEPH_FILE_MAX_SIZE (1ULL << 40) /* 1 TB */ + +/* + * tcp connection banner. include a protocol version. and adjust + * whenever the wire protocol changes. try to keep this string length + * constant. + */ +#define CEPH_BANNER "ceph 007\n" +#define CEPH_BANNER_MAX_LEN 30 + +/* + * subprotocol versions. when specific messages types or high-level + * protocols change, bump the affected components. + */ +#define CEPH_OSD_PROTOCOL 3 +#define CEPH_MDS_PROTOCOL 2 +#define CEPH_MON_PROTOCOL 2 +#define CEPH_CLIENT_PROTOCOL 1 +#define CEPH_ONDISK_FORMAT 1 + +/* + * types in this file are defined as little-endian, and are + * primarily intended to describe data structures that pass + * over the wire or that are stored on disk. + */ + +/* + * some basics + */ +typedef __le64 ceph_version_t; +typedef __le64 ceph_tid_t; /* transaction id */ +typedef __le32 ceph_epoch_t; + + +/* + * fs id + */ +struct ceph_fsid { + __le64 major; + __le64 minor; +} __attribute__ ((packed)); + +static inline int ceph_fsid_equal(const struct ceph_fsid *a, + const struct ceph_fsid *b) +{ + return a->major == b->major && a->minor == b->minor; +} + + +/* + * ino, object, etc. + */ +#define CEPH_INO_ROOT 1 + +typedef __le64 ceph_snapid_t; +#define CEPH_MAXSNAP ((__u64)(-3)) +#define CEPH_SNAPDIR ((__u64)(-1)) +#define CEPH_NOSNAP ((__u64)(-2)) + +struct ceph_object { + union { + __u8 raw[20]; /* fits a sha1 hash */ + struct { + __le64 ino; /* inode "file" identifier */ + __le32 bno; /* "block" (object) in that "file" */ + __le64 snap; /* snapshot id. usually NOSNAP. */ + } __attribute__ ((packed)); + }; +} __attribute__ ((packed)); + +struct ceph_timespec { + __le32 tv_sec; + __le32 tv_nsec; +} __attribute__ ((packed)); + + +/* + * "Frags" are a way to describe a subset of a 32-bit number space, + * using a mask and a value to match against that mask. Any given frag + * (subset of the number space) can be partitioned into 2^n sub-frags. 
+ * + * Frags are encoded into a 32-bit word: + * 8 upper bits = "bits" + * 24 lower bits = "value" + * (We could go to 5+27 bits, but who cares.) + * + * We use the _most_ significant bits of the 24 bit value. This makes + * values logically sort. + * + * Unfortunately, because the "bits" field is still in the high bits, we + * can't sort encoded frags numerically. However, it does allow you + * to feed encoded frags as values into frag_contains_value. + */ +static inline __u32 frag_make(__u32 b, __u32 v) +{ + return (b << 24) | + (v & (0xffffffu << (24-b)) & 0xffffffu); +} +static inline __u32 frag_bits(__u32 f) +{ + return f >> 24; +} +static inline __u32 frag_value(__u32 f) +{ + return f & 0xffffffu; +} +static inline __u32 frag_mask(__u32 f) +{ + return (0xffffffu << (24-frag_bits(f))) & 0xffffffu; +} +static inline __u32 frag_mask_shift(__u32 f) +{ + return 24 - frag_bits(f); +} + +static inline int frag_contains_value(__u32 f, __u32 v) +{ + return (v & frag_mask(f)) == frag_value(f); +} +static inline int frag_contains_frag(__u32 f, __u32 sub) +{ + /* is sub as specific as us, and contained by us? 
*/ + return frag_bits(sub) >= frag_bits(f) && + (frag_value(sub) & frag_mask(f)) == frag_value(f); +} + +static inline __u32 frag_parent(__u32 f) +{ + return frag_make(frag_bits(f) - 1, + frag_value(f) & (frag_mask(f) << 1)); +} +static inline int frag_is_left_child(__u32 f) +{ + return frag_bits(f) > 0 && + (frag_value(f) & (0x1000000 >> frag_bits(f))) == 0; +} +/* + * true if f is the right-hand half of its parent, i.e. the lowest bit + * selected by f's "bits" count is set in its value. the masked result + * is either 0 or that bit itself, so test for non-zero rather than + * comparing against 1. + */ +static inline int frag_is_right_child(__u32 f) +{ + return frag_bits(f) > 0 && + (frag_value(f) & (0x1000000 >> frag_bits(f))) != 0; +} +static inline __u32 frag_sibling(__u32 f) +{ + return frag_make(frag_bits(f), + frag_value(f) ^ (0x1000000 >> frag_bits(f))); +} +static inline __u32 frag_left_child(__u32 f) +{ + return frag_make(frag_bits(f)+1, frag_value(f)); +} +static inline __u32 frag_right_child(__u32 f) +{ + return frag_make(frag_bits(f)+1, + frag_value(f) | (0x1000000 >> (1+frag_bits(f)))); +} +static inline __u32 frag_make_child(__u32 f, int by, int i) +{ + int newbits = frag_bits(f) + by; + return frag_make(newbits, + frag_value(f) | (i << (24 - newbits))); +} +static inline int frag_is_leftmost(__u32 f) +{ + return frag_value(f) == 0; +} +static inline int frag_is_rightmost(__u32 f) +{ + return frag_value(f) == frag_mask(f); +} +static inline __u32 frag_next(__u32 f) +{ + return frag_make(frag_bits(f), + frag_value(f) + (0x1000000 >> frag_bits(f))); +} + +/* + * comparator to sort frags logically, as when traversing the + * number space in ascending order... 
+ */ +static inline int frag_compare(__u32 a, __u32 b) +{ + unsigned va = frag_value(a); + unsigned vb = frag_value(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + va = frag_bits(a); + vb = frag_bits(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + return 0; +} + +/* + * object layout - how objects are mapped into PGs + */ +#define CEPH_OBJECT_LAYOUT_HASH 1 +#define CEPH_OBJECT_LAYOUT_LINEAR 2 +#define CEPH_OBJECT_LAYOUT_HASHINO 3 + +/* + * pg layout -- how PGs are mapped onto (sets of) OSDs + */ +#define CEPH_PG_LAYOUT_CRUSH 0 +#define CEPH_PG_LAYOUT_HASH 1 +#define CEPH_PG_LAYOUT_LINEAR 2 +#define CEPH_PG_LAYOUT_HYBRID 3 + +/* + * ceph_file_layout - describe data layout for a file/inode + */ +struct ceph_file_layout { + /* file -> object mapping */ + __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple + of page size. */ + __le32 fl_stripe_count; /* over this many objects */ + __le32 fl_object_size; /* until objects are this big, then move to + new objects */ + __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ + + /* pg -> disk layout */ + __le32 fl_object_stripe_unit; /* for per-object parity, if any */ + + /* object -> pg layout */ + __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ + __u8 fl_pg_type; /* pg type; see PG_TYPE_* */ + __u8 fl_pg_size; /* pg size (num replicas, etc.) 
*/ + __u8 fl_pg_pool; /* implies crush ruleset AND object namespace */ +} __attribute__ ((packed)); + +#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) +#define ceph_file_layout_stripe_count(l) \ + ((__s32)le32_to_cpu((l).fl_stripe_count)) +#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) +#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) +#define ceph_file_layout_object_su(l) \ + ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) +#define ceph_file_layout_pg_preferred(l) \ + ((__s32)le32_to_cpu((l).fl_pg_preferred)) + +#define ceph_file_layout_stripe_width(l) (le32_to_cpu((l).fl_stripe_unit) * \ + le32_to_cpu((l).fl_stripe_count)) + +/* "period" == bytes before i start on a new set of objects */ +#define ceph_file_layout_period(l) (le32_to_cpu((l).fl_object_size) * \ + le32_to_cpu((l).fl_stripe_count)) + +/* + * placement group. + * we encode this into one __le64. + */ +#define CEPH_PG_TYPE_REP 1 +#define CEPH_PG_TYPE_RAID4 2 +union ceph_pg { + __u64 pg64; + struct { + __s16 preferred; /* preferred primary osd */ + __u16 ps; /* placement seed */ + __u8 __pad; + __u8 size; + __u8 pool; /* implies crush ruleset */ + __u8 type; + } pg; +} __attribute__ ((packed)); + +#define ceph_pg_is_rep(pg) ((pg).pg.type == CEPH_PG_TYPE_REP) +#define ceph_pg_is_raid4(pg) ((pg).pg.type == CEPH_PG_TYPE_RAID4) + +/* + * stable_mod func is used to control number of placement groups. + * similar to straight-up modulo, but produces a stable mapping as b + * increases over time. b is the number of bins, and bmask is the + * containing power of 2 minus 1. + * + * b <= bmask and bmask=(2**n)-1 + * e.g., b=12 -> bmask=15, b=123 -> bmask=127 + */ +static inline int ceph_stable_mod(int x, int b, int bmask) +{ + if ((x & bmask) < b) + return x & bmask; + else + return x & (bmask >> 1); +} + +/* + * object layout - how a given object should be stored. 
+ */ +struct ceph_object_layout { + __le64 ol_pgid; /* raw pg, with _full_ ps precision. */ + __le32 ol_stripe_unit; +} __attribute__ ((packed)); + +/* + * compound epoch+version, used by storage layer to serialize mutations + */ +struct ceph_eversion { + ceph_epoch_t epoch; + __le64 version; +} __attribute__ ((packed)); + +/* + * osd map bits + */ + +/* status bits */ +#define CEPH_OSD_EXISTS 1 +#define CEPH_OSD_UP 2 +#define CEPH_OSD_CLEAN 4 /* as in, clean shutdown */ + +/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ +#define CEPH_OSD_IN 0x10000 +#define CEPH_OSD_OUT 0 + + +/* + * string hash. + * + * taken from Linux, tho we should probably take care to use this one + * in case the upstream hash changes. + */ + +/* Name hashing routines. Initial hash value */ +/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ +#define ceph_init_name_hash() 0 + +/* partial hash update function. Assume roughly 4 bits per character */ +static inline unsigned long +ceph_partial_name_hash(unsigned long c, unsigned long prevhash) +{ + return (prevhash + (c << 4) + (c >> 4)) * 11; +} + +/* + * Finally: cut down the number of bits to a int value (and try to avoid + * losing bits) + */ +static inline unsigned long ceph_end_name_hash(unsigned long hash) +{ + return (unsigned int) hash; +} + +/* Compute the hash for a name string. 
*/ +static inline unsigned int +ceph_full_name_hash(const char *name, unsigned int len) +{ + unsigned long hash = ceph_init_name_hash(); + /* + * feed each byte in as unsigned char: plain char may be signed, and + * a sign-extended byte >= 0x80 would hash differently from one + * platform to the next. + */ + while (len--) + hash = ceph_partial_name_hash((unsigned char)*name++, hash); + return ceph_end_name_hash(hash); +} + + + +/********************************************* + * message layer + */ + +/* + * entity_name + */ +struct ceph_entity_name { + __le32 type; + __le32 num; +} __attribute__ ((packed)); + +#define CEPH_ENTITY_TYPE_MON 1 +#define CEPH_ENTITY_TYPE_MDS 2 +#define CEPH_ENTITY_TYPE_OSD 3 +#define CEPH_ENTITY_TYPE_CLIENT 4 +#define CEPH_ENTITY_TYPE_ADMIN 5 + +/* used by message exchange protocol */ +#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */ +#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */ +#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing + incoming connection */ +#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again + with higher cseq */ +#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again + with higher gseq */ +#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */ +#define CEPH_MSGR_TAG_MSG 10 /* message */ +#define CEPH_MSGR_TAG_ACK 11 /* message ack */ + + +/* + * entity_addr -- network address + */ +struct ceph_entity_addr { + __le32 erank; /* entity's rank in process */ + __le32 nonce; /* unique id for process (e.g. 
pid) */ + struct sockaddr_in ipaddr; +} __attribute__ ((packed)); + +static inline bool ceph_entity_addr_is_local(const struct ceph_entity_addr *a, + const struct ceph_entity_addr *b) +{ + return le32_to_cpu(a->nonce) == le32_to_cpu(b->nonce) && + a->ipaddr.sin_addr.s_addr == b->ipaddr.sin_addr.s_addr; +} + +static inline bool ceph_entity_addr_equal(const struct ceph_entity_addr *a, + const struct ceph_entity_addr *b) +{ + return memcmp(a, b, sizeof(*a)) == 0; +} + +struct ceph_entity_inst { + struct ceph_entity_name name; + struct ceph_entity_addr addr; +} __attribute__ ((packed)); + + +/* + * connection negotiation + */ +struct ceph_msg_connect { + __le32 host_type; /* CEPH_ENTITY_TYPE_* */ + __le32 global_seq; + __le32 connect_seq; + __u8 flags; +} __attribute__ ((packed)); + +struct ceph_msg_connect_reply { + __u8 tag; + __le32 global_seq; + __le32 connect_seq; + __u8 flags; +} __attribute__ ((packed)); + +#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */ + + +/* + * message header + */ +struct ceph_msg_header { + __le64 seq; /* message seq# for this session */ + __le16 type; /* message type */ + __le16 priority; /* priority. 
higher value == higher priority */ + __le16 mon_protocol, osd_protocol, mds_protocol, + client_protocol; /* protocol versions */ + __le32 front_len; /* bytes in main payload */ + __le32 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + __le32 data_len; /* bytes of data payload */ + struct ceph_entity_inst src, orig_src, dst; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + +#define CEPH_MSG_PRIO_LOW 64 +#define CEPH_MSG_PRIO_DEFAULT 127 +#define CEPH_MSG_PRIO_HIGH 196 +#define CEPH_MSG_PRIO_HIGHEST 255 + +/* + * follows data payload + */ +struct ceph_msg_footer { + __le32 flags; + __le32 front_crc; + __le32 data_crc; +} __attribute__ ((packed)); + +#define CEPH_MSG_FOOTER_ABORTED (1<<0) /* drop this message */ +#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ + + +/* + * message types + */ + +/* misc */ +#define CEPH_MSG_SHUTDOWN 1 +#define CEPH_MSG_PING 2 + +/* client <-> monitor */ +#define CEPH_MSG_MON_MAP 4 +#define CEPH_MSG_MON_GET_MAP 5 +#define CEPH_MSG_CLIENT_MOUNT 10 +#define CEPH_MSG_CLIENT_UNMOUNT 11 +#define CEPH_MSG_STATFS 12 +#define CEPH_MSG_STATFS_REPLY 13 + +/* client <-> mds */ +#define CEPH_MSG_MDS_GETMAP 20 +#define CEPH_MSG_MDS_MAP 21 + +#define CEPH_MSG_CLIENT_SESSION 22 +#define CEPH_MSG_CLIENT_RECONNECT 23 + +#define CEPH_MSG_CLIENT_REQUEST 24 +#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 +#define CEPH_MSG_CLIENT_REPLY 26 +#define CEPH_MSG_CLIENT_CAPS 0x310 +#define CEPH_MSG_CLIENT_LEASE 0x311 +#define CEPH_MSG_CLIENT_SNAP 0x312 + +/* osd */ +#define CEPH_MSG_OSD_GETMAP 40 +#define CEPH_MSG_OSD_MAP 41 +#define CEPH_MSG_OSD_OP 42 +#define CEPH_MSG_OSD_OPREPLY 43 + + +struct ceph_mon_statfs { + struct ceph_fsid fsid; + __le64 tid; +}; + +struct ceph_statfs { + __le64 f_total; + __le64 f_free; /* used = total - free (KB) */ + __le64 f_avail; /* usable */ + __le64 f_objects; +}; + +struct ceph_mon_statfs_reply { + struct ceph_fsid fsid; + __le64 tid; + struct ceph_statfs st; +}; + +struct 
ceph_osd_getmap { + struct ceph_fsid fsid; + __le32 start; +} __attribute__ ((packed)); + +struct ceph_mds_getmap { + struct ceph_fsid fsid; + __le32 want; +} __attribute__ ((packed)); + + +/* + * mds states + * > 0 -> in + * <= 0 -> out + */ +#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */ +#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. + empty log. */ +#define CEPH_MDS_STATE_DESTROYING -2 /* down, existing, semi-destroyed. */ +#define CEPH_MDS_STATE_FAILED 3 /* down, needs to be recovered. */ + +#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */ +#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */ +#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ +#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds. */ + +#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ +#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed + operations (import, rename, etc.) */ +#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */ +#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */ +#define CEPH_MDS_STATE_ACTIVE 12 /* up, active */ +#define CEPH_MDS_STATE_STOPPING 13 /* up, but exporting metadata */ + + +/* + * metadata lock types. + * - these are bitmasks.. 
we can compose them + * - they also define the lock ordering by the MDS + * - a few of these are internal to the mds + */ +#define CEPH_LOCK_DN 1 +#define CEPH_LOCK_ISNAP 2 +#define CEPH_LOCK_IVERSION 4 /* mds internal */ +#define CEPH_LOCK_IFILE 8 /* mds internal */ +#define CEPH_LOCK_IDIR 16 /* mds internal */ +#define CEPH_LOCK_IAUTH 32 +#define CEPH_LOCK_ILINK 64 +#define CEPH_LOCK_IDFT 128 /* dir frag tree */ +#define CEPH_LOCK_INEST 256 /* mds internal */ +#define CEPH_LOCK_IXATTR 512 +#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */ + +/* alias for either filelock or dirlock */ +#define CEPH_LOCK_ICONTENT (CEPH_LOCK_IFILE|CEPH_LOCK_IDIR) + +/* + * stat masks are defined in terms of the locks that cover inode fields. + */ +#define CEPH_STAT_MASK_INODE CEPH_LOCK_INO +#define CEPH_STAT_MASK_TYPE CEPH_LOCK_INO /* mode >> 12 */ +#define CEPH_STAT_MASK_SYMLINK CEPH_LOCK_INO +#define CEPH_STAT_MASK_UID CEPH_LOCK_IAUTH +#define CEPH_STAT_MASK_GID CEPH_LOCK_IAUTH +#define CEPH_STAT_MASK_MODE CEPH_LOCK_IAUTH +#define CEPH_STAT_MASK_NLINK CEPH_LOCK_ILINK +#define CEPH_STAT_MASK_LAYOUT CEPH_LOCK_ICONTENT +#define CEPH_STAT_MASK_MTIME CEPH_LOCK_ICONTENT +#define CEPH_STAT_MASK_SIZE CEPH_LOCK_ICONTENT +#define CEPH_STAT_MASK_ATIME CEPH_LOCK_ICONTENT /* fixme */ +#define CEPH_STAT_MASK_XATTR CEPH_LOCK_IXATTR +#define CEPH_STAT_MASK_INODE_ALL (CEPH_LOCK_ICONTENT | CEPH_LOCK_IAUTH | \ + CEPH_LOCK_ILINK | CEPH_LOCK_INO) + +/* client_session ops */ +enum { + CEPH_SESSION_REQUEST_OPEN, + CEPH_SESSION_OPEN, + CEPH_SESSION_REQUEST_CLOSE, + CEPH_SESSION_CLOSE, + CEPH_SESSION_REQUEST_RENEWCAPS, + CEPH_SESSION_RENEWCAPS, + CEPH_SESSION_STALE, +}; + +static inline const char *ceph_session_op_name(int op) +{ + switch (op) { + case CEPH_SESSION_REQUEST_OPEN: return "request_open"; + case CEPH_SESSION_OPEN: return "open"; + case CEPH_SESSION_REQUEST_CLOSE: return "request_close"; + case CEPH_SESSION_CLOSE: return "close"; + case CEPH_SESSION_REQUEST_RENEWCAPS: return 
"request_renewcaps"; + case CEPH_SESSION_RENEWCAPS: return "renewcaps"; + case CEPH_SESSION_STALE: return "stale"; + default: return "???"; + } +} + +struct ceph_mds_session_head { + __le32 op; + __le64 seq; + struct ceph_timespec stamp; +} __attribute__ ((packed)); + +/* client_request */ +/* + * metadata ops. + * & 0x001000 -> write op + * & 0x010000 -> follow symlink (e.g. stat(), not lstat()). + * & 0x100000 -> use weird ino/path trace + */ +#define CEPH_MDS_OP_WRITE 0x001000 +#define CEPH_MDS_OP_FOLLOW_LINK 0x010000 +#define CEPH_MDS_OP_INO_PATH 0x100000 +enum { + CEPH_MDS_OP_FINDINODE = 0x100100, + + CEPH_MDS_OP_LSTAT = 0x00100, + CEPH_MDS_OP_LUTIME = 0x01101, + CEPH_MDS_OP_LCHMOD = 0x01102, + CEPH_MDS_OP_LCHOWN = 0x01103, + CEPH_MDS_OP_LSETXATTR = 0x01104, + CEPH_MDS_OP_LRMXATTR = 0x01105, + CEPH_MDS_OP_LSETLAYOUT = 0x01106, + + CEPH_MDS_OP_STAT = 0x10100, + CEPH_MDS_OP_UTIME = 0x11101, + CEPH_MDS_OP_CHMOD = 0x11102, + CEPH_MDS_OP_CHOWN = 0x11103, + CEPH_MDS_OP_SETXATTR = 0x11104, + CEPH_MDS_OP_RMXATTR = 0x11105, + + CEPH_MDS_OP_MKNOD = 0x01201, + CEPH_MDS_OP_LINK = 0x01202, + CEPH_MDS_OP_UNLINK = 0x01203, + CEPH_MDS_OP_RENAME = 0x01204, + CEPH_MDS_OP_MKDIR = 0x01220, + CEPH_MDS_OP_RMDIR = 0x01221, + CEPH_MDS_OP_SYMLINK = 0x01222, + + CEPH_MDS_OP_OPEN = 0x10302, + CEPH_MDS_OP_TRUNCATE = 0x11303, + CEPH_MDS_OP_LTRUNCATE = 0x01303, + CEPH_MDS_OP_FSYNC = 0x00304, + CEPH_MDS_OP_READDIR = 0x00305, + + CEPH_MDS_OP_MKSNAP = 0x01400, + CEPH_MDS_OP_RMSNAP = 0x01401, + CEPH_MDS_OP_LSSNAP = 0x00402, +}; + +static inline const char *ceph_mds_op_name(int op) +{ + switch (op) { + case CEPH_MDS_OP_FINDINODE: return "findinode"; + case CEPH_MDS_OP_STAT: return "stat"; + case CEPH_MDS_OP_LSTAT: return "lstat"; + case CEPH_MDS_OP_UTIME: return "utime"; + case CEPH_MDS_OP_LUTIME: return "lutime"; + case CEPH_MDS_OP_CHMOD: return "chmod"; + case CEPH_MDS_OP_LCHMOD: return "lchmod"; + case CEPH_MDS_OP_CHOWN: return "chown"; + case CEPH_MDS_OP_LCHOWN: return "lchown"; + case 
CEPH_MDS_OP_LSETLAYOUT: return "lsetlayout"; + case CEPH_MDS_OP_SETXATTR: return "setxattr"; + case CEPH_MDS_OP_LSETXATTR: return "lsetxattr"; + case CEPH_MDS_OP_RMXATTR: return "rmxattr"; + case CEPH_MDS_OP_LRMXATTR: return "lrmxattr"; + case CEPH_MDS_OP_READDIR: return "readdir"; + case CEPH_MDS_OP_MKNOD: return "mknod"; + case CEPH_MDS_OP_LINK: return "link"; + case CEPH_MDS_OP_UNLINK: return "unlink"; + case CEPH_MDS_OP_RENAME: return "rename"; + case CEPH_MDS_OP_MKDIR: return "mkdir"; + case CEPH_MDS_OP_RMDIR: return "rmdir"; + case CEPH_MDS_OP_SYMLINK: return "symlink"; + case CEPH_MDS_OP_OPEN: return "open"; + case CEPH_MDS_OP_TRUNCATE: return "truncate"; + case CEPH_MDS_OP_LTRUNCATE: return "ltruncate"; + case CEPH_MDS_OP_FSYNC: return "fsync"; + case CEPH_MDS_OP_LSSNAP: return "lssnap"; + case CEPH_MDS_OP_MKSNAP: return "mksnap"; + case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + default: return "???"; + } +} + +struct ceph_mds_request_head { + ceph_tid_t tid, oldest_client_tid; + ceph_epoch_t mdsmap_epoch; /* on client */ + __le32 num_fwd; + __le32 retry_attempt; + __le64 mds_wants_replica_in_dirino; + __le32 op; + __le32 caller_uid, caller_gid; + + union { + struct { + __le32 mask; + } __attribute__ ((packed)) stat; + struct { + __le32 mask; + } __attribute__ ((packed)) fstat; + struct { + __le32 frag; + } __attribute__ ((packed)) readdir; + struct { + struct ceph_timespec mtime; + struct ceph_timespec atime; + struct ceph_timespec ctime; + __le32 mask; + } __attribute__ ((packed)) utime; + struct { + __le32 mode; + } __attribute__ ((packed)) chmod; + struct { + __le32 uid; + __le32 gid; + __le32 mask; + } __attribute__ ((packed)) chown; + struct { + __le32 mode; + __le32 rdev; + } __attribute__ ((packed)) mknod; + struct { + __le32 mode; + } __attribute__ ((packed)) mkdir; + struct { + __le32 flags; + __le32 mode; + } __attribute__ ((packed)) open; + struct { + __le64 length; + } __attribute__ ((packed)) truncate; + struct { + __le32 flags; + } __attribute__ 
((packed)) setxattr; + struct { + struct ceph_file_layout layout; + } __attribute__ ((packed)) setlayout; + } __attribute__ ((packed)) args; +} __attribute__ ((packed)); + +/* masks for utimes() */ +#define CEPH_UTIME_ATIME 1 +#define CEPH_UTIME_MTIME 2 +#define CEPH_UTIME_CTIME 4 + +/* masks for chown */ +#define CEPH_CHOWN_UID 1 +#define CEPH_CHOWN_GID 2 + +struct ceph_inopath_item { + __le64 ino; + __le32 dname_hash; +} __attribute__ ((packed)); + +/* client reply */ +struct ceph_mds_reply_head { + ceph_tid_t tid; + __le32 op; + __le32 result; + __le32 file_caps; + __le32 file_caps_seq; + __le32 file_caps_mseq; + __le32 mdsmap_epoch; +} __attribute__ ((packed)); + +/* one for each node split */ +struct ceph_frag_tree_split { + __le32 frag; /* this frag splits... */ + __le32 by; /* ...by this many bits */ +} __attribute__ ((packed)); + +struct ceph_frag_tree_head { + __le32 nsplits; + struct ceph_frag_tree_split splits[]; +} __attribute__ ((packed)); + +struct ceph_mds_reply_inode { + __le64 ino; + __le64 snapid; + __le64 version; + struct ceph_file_layout layout; + struct ceph_timespec ctime, mtime, atime; + __le64 time_warp_seq; + __le32 rdev; + __le32 mode, uid, gid; + __le32 nlink; + __le64 size, max_size, truncate_seq; + __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */ + struct ceph_timespec rctime; + struct ceph_frag_tree_head fragtree; +} __attribute__ ((packed)); +/* followed by frag array, then symlink string, then xattr blob */ + +/* reply_lease follows dname, and reply_inode */ +struct ceph_mds_reply_lease { + __le16 mask; + __le32 duration_ms; +} __attribute__ ((packed)); + +struct ceph_mds_reply_dirfrag { + __le32 frag; /* fragment */ + __le32 auth; /* auth mds, if this is a delegation point */ + __le32 ndist; /* number of mds' this is replicated on */ + __le32 dist[]; +} __attribute__ ((packed)); + +/* file access modes */ +#define CEPH_FILE_MODE_PIN 0 +#define CEPH_FILE_MODE_RD 1 +#define CEPH_FILE_MODE_WR 2 +#define 
CEPH_FILE_MODE_RDWR 3 /* RD | WR */ +#define CEPH_FILE_MODE_LAZY 4 /* lazy io */ +#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */ + +static inline int ceph_flags_to_mode(int flags) +{ +#ifdef O_DIRECTORY /* fixme */ + if ((flags & O_DIRECTORY) == O_DIRECTORY) + return CEPH_FILE_MODE_PIN; +#endif +#ifdef O_LAZY + if (flags & O_LAZY) + return CEPH_FILE_MODE_LAZY; +#endif + if ((flags & O_APPEND) == O_APPEND) + flags |= O_WRONLY; + + flags &= O_ACCMODE; + if ((flags & O_RDWR) == O_RDWR) + return CEPH_FILE_MODE_RDWR; + if ((flags & O_WRONLY) == O_WRONLY) + return CEPH_FILE_MODE_WR; + return CEPH_FILE_MODE_RD; +} + +/* client file caps */ +#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ +#define CEPH_CAP_RDCACHE 2 /* client can cache reads */ +#define CEPH_CAP_RD 4 /* client can read */ +#define CEPH_CAP_WR 8 /* client can write */ +#define CEPH_CAP_WRBUFFER 16 /* client can buffer writes */ +#define CEPH_CAP_WREXTEND 32 /* client can extend EOF */ +#define CEPH_CAP_LAZYIO 64 /* client can perform lazy io */ +#define CEPH_CAP_EXCL 128 /* exclusive/loner access */ + +static inline int ceph_caps_for_mode(int mode) +{ + switch (mode) { + case CEPH_FILE_MODE_PIN: + return CEPH_CAP_PIN; + case CEPH_FILE_MODE_RD: + return CEPH_CAP_PIN | + CEPH_CAP_RD | CEPH_CAP_RDCACHE; + case CEPH_FILE_MODE_RDWR: + return CEPH_CAP_PIN | + CEPH_CAP_RD | CEPH_CAP_RDCACHE | + CEPH_CAP_WR | CEPH_CAP_WRBUFFER | + CEPH_CAP_EXCL; + case CEPH_FILE_MODE_WR: + return CEPH_CAP_PIN | + CEPH_CAP_WR | CEPH_CAP_WRBUFFER | + CEPH_CAP_EXCL; + } + return 0; +} + +enum { + CEPH_CAP_OP_GRANT, /* mds->client grant */ + CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */ + CEPH_CAP_OP_EXPORT, /* mds has exported the cap */ + CEPH_CAP_OP_IMPORT, /* mds has imported the cap from specified mds */ + CEPH_CAP_OP_RELEASED, /* mds->client close out cap */ + CEPH_CAP_OP_FLUSHEDSNAP, /* mds->client flushed snap */ + CEPH_CAP_OP_ACK, /* client->mds ack (if prior grant was recall) */ + 
CEPH_CAP_OP_REQUEST, /* client->mds request (update wanted bits) */ + CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */ + CEPH_CAP_OP_RELEASE, /* client->mds request release cap */ +}; + +static inline const char *ceph_cap_op_name(int op) +{ + switch (op) { + case CEPH_CAP_OP_GRANT: return "grant"; + case CEPH_CAP_OP_TRUNC: return "trunc"; + case CEPH_CAP_OP_EXPORT: return "export"; + case CEPH_CAP_OP_IMPORT: return "import"; + case CEPH_CAP_OP_RELEASED: return "released"; + case CEPH_CAP_OP_FLUSHEDSNAP: return "flushedsnap"; + case CEPH_CAP_OP_ACK: return "ack"; + case CEPH_CAP_OP_REQUEST: return "request"; + case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap"; + case CEPH_CAP_OP_RELEASE: return "release"; + default: return "???"; + } +} + +/* + * caps message, used for capability callbacks, acks, requests, etc. + */ +struct ceph_mds_caps { + __le32 op; + __le64 ino; + __le32 seq; + __le32 caps, wanted; + __le64 size, max_size; + __le64 truncate_seq; + __le32 migrate_seq; + struct ceph_timespec mtime, atime, ctime; + struct ceph_file_layout layout; + __le64 time_warp_seq; + __le64 snap_follows; + __le32 snap_trace_len; +} __attribute__ ((packed)); + + +#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ +#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */ +#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */ + +struct ceph_mds_lease { + __u8 action; + __le16 mask; + __le64 ino; + __le64 first, last; +} __attribute__ ((packed)); +/* followed by a __le32+string for dname */ + + +/* client reconnect */ +struct ceph_mds_cap_reconnect { + __le32 wanted; + __le32 issued; + __le64 size; + struct ceph_timespec mtime, atime; + __le64 snaprealm; +} __attribute__ ((packed)); +/* followed by encoded string */ + +struct ceph_mds_snaprealm_reconnect { + __le64 ino; + __le64 seq; + __le64 parent; /* parent realm */ +} __attribute__ ((packed)); + +/* + * snaps + */ +enum { + CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */ + CEPH_SNAP_OP_CREATE, + CEPH_SNAP_OP_DESTROY, + 
CEPH_SNAP_OP_SPLIT, +}; + +static inline const char *ceph_snap_op_name(int o) +{ + switch (o) { + case CEPH_SNAP_OP_UPDATE: return "update"; + case CEPH_SNAP_OP_CREATE: return "create"; + case CEPH_SNAP_OP_DESTROY: return "destroy"; + case CEPH_SNAP_OP_SPLIT: return "split"; + default: return "???"; + } +} + +struct ceph_mds_snap_head { + __le32 op; + __le64 split; + __le32 num_split_inos; + __le32 num_split_realms; + __le32 trace_len; +} __attribute__ ((packed)); +/* followed by split ino list, then split realms, then the trace blob */ + +/* + * encode info about a snaprealm, as viewed by a client + */ +struct ceph_mds_snap_realm { + __le64 ino; /* ino */ + __le64 created; /* snap: when created */ + __le64 parent; /* ino: parent realm */ + __le64 parent_since; /* snap: same parent since */ + __le64 seq; /* snap: version */ + __le32 num_snaps; + __le32 num_prior_parent_snaps; +} __attribute__ ((packed)); +/* followed by my snap list, then prior parent snap list */ + +/* + * osd map flag bits + */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ + +/* + * osd ops + */ +#define CEPH_OSD_OP_MODE 0xf000 +#define CEPH_OSD_OP_MODE_RD 0x1000 +#define CEPH_OSD_OP_MODE_WR 0x2000 +#define CEPH_OSD_OP_MODE_SUB 0x4000 + +#define CEPH_OSD_OP_TYPE 0x0f00 +#define CEPH_OSD_OP_TYPE_LOCK 0x0100 +#define CEPH_OSD_OP_TYPE_DATA 0x0200 +#define CEPH_OSD_OP_TYPE_ATTR 0x0300 + +enum { + /* read */ + CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, + CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, + + CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, + CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, + + /* subop */ + CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, + CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, + CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, + CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, + + 
/* object data */ + CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, + CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, + CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, + CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, + CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, + + /* object attrs */ + CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, + CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, + CEPH_OSD_OP_RESETXATTRS= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 3, + CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, + + /* lock */ + CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, + CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, + CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, + CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, + CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, + CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, + + /* fancy read */ + CEPH_OSD_OP_GREP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3, + + /* fancy write */ + CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, +}; + +static inline int ceph_osd_op_type_lock(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK; +} +static inline int ceph_osd_op_type_data(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; +} +static inline int ceph_osd_op_type_attr(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; +} + +static inline int ceph_osd_op_mode_subop(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; +} +static inline int ceph_osd_op_mode_read(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; +} +static inline int ceph_osd_op_mode_modify(int op) +{ + return (op & CEPH_OSD_OP_MODE) 
== CEPH_OSD_OP_MODE_WR; +} + +/* + * map an osd op code to a short printable name; codes without a case + * below yield "???". + */ +static inline const char *ceph_osd_op_name(int op) +{ + switch (op) { + case CEPH_OSD_OP_READ: return "read"; + case CEPH_OSD_OP_STAT: return "stat"; + case CEPH_OSD_OP_GREP: return "grep"; + + case CEPH_OSD_OP_GETXATTR: return "getxattr"; + case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; + + case CEPH_OSD_OP_WRITE: return "write"; + case CEPH_OSD_OP_DELETE: return "delete"; + case CEPH_OSD_OP_TRUNCATE: return "truncate"; + case CEPH_OSD_OP_ZERO: return "zero"; + case CEPH_OSD_OP_WRITEFULL: return "writefull"; + case CEPH_OSD_OP_APPEND: return "append"; + + case CEPH_OSD_OP_SETXATTR: return "setxattr"; + case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; + case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; + case CEPH_OSD_OP_RMXATTR: return "rmxattr"; + + case CEPH_OSD_OP_WRLOCK: return "wrlock"; + case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; + case CEPH_OSD_OP_RDLOCK: return "rdlock"; + case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; + case CEPH_OSD_OP_UPLOCK: return "uplock"; + case CEPH_OSD_OP_DNLOCK: return "dnlock"; + + case CEPH_OSD_OP_PULL: return "pull"; + case CEPH_OSD_OP_PUSH: return "push"; + case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; + case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; + + default: return "???"; + } +} + + +/* + * osd op flags + */ +enum { + CEPH_OSD_OP_ACK = 1, /* want (or is) "ack" ack */ + CEPH_OSD_OP_SAFE = 2, /* want (or is) "safe" ack */ + CEPH_OSD_OP_RETRY = 4, /* resend attempt */ + CEPH_OSD_OP_INCLOCK_FAIL = 8, /* fail on inclock collision */ + CEPH_OSD_OP_MODIFY = 16, /* op is/was a mutation */ + CEPH_OSD_OP_ACKNVRAM = 32, /* ACK when stable in NVRAM, not RAM */ + CEPH_OSD_OP_ORDERSNAP = 64, /* EOLDSNAPC if snapc is out of order */ + CEPH_OSD_OP_PEERSTAT = 128, /* msg includes osd_peer_stat */ + CEPH_OSD_OP_BALANCE_READS = 256, +}; + +#define EOLDSNAPC 44 /* ORDERSNAP flag set and writer has old snap context */ + +struct ceph_osd_op { + __le16 op; + union { + struct { + __le64 offset, length; + }; + struct { + __le32 name_len; + __le32 value_len; + }; + }; +} __attribute__ ((packed)); + +struct ceph_osd_request_head { + ceph_tid_t tid; + __le32 client_inc; + 
struct ceph_object oid; + struct ceph_object_layout layout; + ceph_epoch_t osdmap_epoch; + + __le32 flags; + __le32 inc_lock; + + struct ceph_eversion reassert_version; + + /* writer's snap context */ + __le64 snap_seq; + __le32 num_snaps; + + /* read or mutation */ + __le16 num_ops; + __u16 object_type; + struct ceph_osd_op ops[]; /* followed by snaps */ +} __attribute__ ((packed)); + +struct ceph_osd_reply_head { + ceph_tid_t tid; + __le32 flags; + struct ceph_object oid; + struct ceph_object_layout layout; + ceph_epoch_t osdmap_epoch; + struct ceph_eversion reassert_version; + + __le32 result; + + __le32 num_ops; + struct ceph_osd_op ops[]; /* trailing op array (flexible array member, as in the request head) */ +} __attribute__ ((packed)); + +#endif -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/