2023-08-04 09:15:51

by Aleksandr Mikhalitsyn

[permalink] [raw]
Subject: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()

From: Christian Brauner <[email protected]>

Inode operations that create a new filesystem object such as ->mknod,
->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
Instead the caller's fs{g,u}id is used for the {g,u}id of the new
filesystem object.

In order to ensure that the correct {g,u}id is used map the caller's
fs{g,u}id for creation requests. This doesn't require complex changes.
It suffices to pass in the relevant idmapping recorded in the request
message. If this request message was triggered from an inode operation
that creates filesystem objects it will have passed down the relevant
idmaping. If this is a request message that was triggered from an inode
operation that doens't need to take idmappings into account the initial
idmapping is passed down which is an identity mapping.

This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
which adds two new fields (owner_{u,g}id) to the request head structure.
So, we need to ensure that MDS supports it otherwise we need to fail
any IO that comes through an idmapped mount because we can't process it
in a proper way. MDS server without such an extension will use caller_{u,g}id
fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
values are unmapped. At the same time we can't map these fields with an
idmapping as it can break UID/GID-based permission checks logic on the
MDS side. This problem was described with a lot of details at [1], [2].

[1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
[2] https://lore.kernel.org/all/[email protected]/

Link: https://github.com/ceph/ceph/pull/52575
Link: https://tracker.ceph.com/issues/62217
Cc: Xiubo Li <[email protected]>
Cc: Jeff Layton <[email protected]>
Cc: Ilya Dryomov <[email protected]>
Cc: [email protected]
Co-Developed-by: Alexander Mikhalitsyn <[email protected]>
Signed-off-by: Christian Brauner <[email protected]>
Signed-off-by: Alexander Mikhalitsyn <[email protected]>
---
v7:
- reworked to use two new fields for owner UID/GID (https://github.com/ceph/ceph/pull/52575)
v8:
- properly handled case when old MDS used with new kernel client
---
fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++++++---
fs/ceph/mds_client.h | 5 +++-
include/linux/ceph/ceph_fs.h | 5 +++-
3 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 8829f55103da..41e4bf3811c4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2902,6 +2902,17 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
}
}

+static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
+{
+ if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
+ return 1;
+
+ if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
+ return 2;
+
+ return CEPH_MDS_REQUEST_HEAD_VERSION;
+}
+
static struct ceph_mds_request_head_legacy *
find_legacy_request_head(void *p, u64 features)
{
@@ -2923,6 +2934,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
{
int mds = session->s_mds;
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
const char *path1 = NULL;
@@ -2936,7 +2948,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
void *p, *end;
int ret;
bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
- bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
+ u16 request_head_version = mds_supported_head_version(session);

ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
req->r_parent, req->r_path1, req->r_ino1.ino,
@@ -2977,8 +2989,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
*/
if (legacy)
len = sizeof(struct ceph_mds_request_head_legacy);
- else if (old_version)
+ else if (request_head_version == 1)
len = sizeof(struct ceph_mds_request_head_old);
+ else if (request_head_version == 2)
+ len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
else
len = sizeof(struct ceph_mds_request_head);

@@ -3028,6 +3042,16 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead = find_legacy_request_head(msg->front.iov_base,
session->s_con.peer_features);

+ if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
+ !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
+ pr_err_ratelimited_client(cl,
+ "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
+ " is not supported by MDS. Fail request with -EIO.\n");
+
+ ret = -EIO;
+ goto out_err;
+ }
+
/*
* The ceph_mds_request_head_legacy didn't contain a version field, and
* one was added when we moved the message version from 3->4.
@@ -3035,17 +3059,34 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
if (legacy) {
msg->hdr.version = cpu_to_le16(3);
p = msg->front.iov_base + sizeof(*lhead);
- } else if (old_version) {
+ } else if (request_head_version == 1) {
struct ceph_mds_request_head_old *ohead = msg->front.iov_base;

msg->hdr.version = cpu_to_le16(4);
ohead->version = cpu_to_le16(1);
p = msg->front.iov_base + sizeof(*ohead);
+ } else if (request_head_version == 2) {
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
+
+ msg->hdr.version = cpu_to_le16(6);
+ nhead->version = cpu_to_le16(2);
+
+ p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
} else {
struct ceph_mds_request_head *nhead = msg->front.iov_base;
+ kuid_t owner_fsuid;
+ kgid_t owner_fsgid;

msg->hdr.version = cpu_to_le16(6);
nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+ nhead->struct_len = sizeof(struct ceph_mds_request_head);
+
+ owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
+ VFSUIDT_INIT(req->r_cred->fsuid));
+ owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
+ VFSGIDT_INIT(req->r_cred->fsgid));
+ nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
+ nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
p = msg->front.iov_base + sizeof(*nhead);
}

diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e3bbf3ba8ee8..8f683e8203bd 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -33,8 +33,10 @@ enum ceph_feature_type {
CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
CEPHFS_FEATURE_OP_GETVXATTR,
CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
+ CEPHFS_FEATURE_HAS_OWNER_UIDGID,

- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
};

#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@@ -49,6 +51,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
CEPHFS_FEATURE_OP_GETVXATTR, \
CEPHFS_FEATURE_32BITS_RETRY_FWD, \
+ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
}

/*
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 5f2301ee88bc..b91699b08f26 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -499,7 +499,7 @@ struct ceph_mds_request_head_legacy {
union ceph_mds_request_args args;
} __attribute__ ((packed));

-#define CEPH_MDS_REQUEST_HEAD_VERSION 2
+#define CEPH_MDS_REQUEST_HEAD_VERSION 3

struct ceph_mds_request_head_old {
__le16 version; /* struct version */
@@ -530,6 +530,9 @@ struct ceph_mds_request_head {

__le32 ext_num_retry; /* new count retry attempts */
__le32 ext_num_fwd; /* new count fwd attempts */
+
+ __le32 struct_len; /* to store size of struct ceph_mds_request_head */
+ __le32 owner_uid, owner_gid; /* used for OPs which create inodes */
} __attribute__ ((packed));

/* cap/lease release record */
--
2.34.1



2023-08-04 15:10:51

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()

On Fri, Aug 04, 2023 at 10:48:49AM +0200, Alexander Mikhalitsyn wrote:
> From: Christian Brauner <[email protected]>
>
> Inode operations that create a new filesystem object such as ->mknod,
> ->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
> Instead the caller's fs{g,u}id is used for the {g,u}id of the new
> filesystem object.
>
> In order to ensure that the correct {g,u}id is used map the caller's
> fs{g,u}id for creation requests. This doesn't require complex changes.
> It suffices to pass in the relevant idmapping recorded in the request
> message. If this request message was triggered from an inode operation
> that creates filesystem objects it will have passed down the relevant
> idmaping. If this is a request message that was triggered from an inode
> operation that doens't need to take idmappings into account the initial
> idmapping is passed down which is an identity mapping.
>
> This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
> which adds two new fields (owner_{u,g}id) to the request head structure.
> So, we need to ensure that MDS supports it otherwise we need to fail
> any IO that comes through an idmapped mount because we can't process it
> in a proper way. MDS server without such an extension will use caller_{u,g}id
> fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
> values are unmapped. At the same time we can't map these fields with an
> idmapping as it can break UID/GID-based permission checks logic on the
> MDS side. This problem was described with a lot of details at [1], [2].
>
> [1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
> [2] https://lore.kernel.org/all/[email protected]/
>
> Link: https://github.com/ceph/ceph/pull/52575
> Link: https://tracker.ceph.com/issues/62217
> Cc: Xiubo Li <[email protected]>
> Cc: Jeff Layton <[email protected]>
> Cc: Ilya Dryomov <[email protected]>
> Cc: [email protected]
> Co-Developed-by: Alexander Mikhalitsyn <[email protected]>
> Signed-off-by: Christian Brauner <[email protected]>
> Signed-off-by: Alexander Mikhalitsyn <[email protected]>
> ---

I like the new extension,
Acked-by: Christian Brauner <[email protected]>

2023-08-05 03:16:52

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()

Hi Alexander,

kernel test robot noticed the following build warnings:

[auto build test WARNING on ceph-client/testing]
[cannot apply to ceph-client/for-linus brauner-vfs/vfs.all linus/master vfs-idmapping/for-next v6.5-rc4 next-20230804]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Alexander-Mikhalitsyn/fs-export-mnt_idmap_get-mnt_idmap_put/20230804-165330
base: https://github.com/ceph/ceph-client.git testing
patch link: https://lore.kernel.org/r/20230804084858.126104-4-aleksandr.mikhalitsyn%40canonical.com
patch subject: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()
config: um-randconfig-r091-20230730 (https://download.01.org/0day-ci/archive/20230805/[email protected]/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce: (https://download.01.org/0day-ci/archive/20230805/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

sparse warnings: (new ones prefixed by >>)
>> fs/ceph/mds_client.c:3082:35: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __le32 [usertype] struct_len @@ got unsigned long @@
fs/ceph/mds_client.c:3082:35: sparse: expected restricted __le32 [usertype] struct_len
fs/ceph/mds_client.c:3082:35: sparse: got unsigned long

vim +3082 fs/ceph/mds_client.c

2927
2928 /*
2929 * called under mdsc->mutex
2930 */
2931 static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
2932 struct ceph_mds_request *req,
2933 bool drop_cap_releases)
2934 {
2935 int mds = session->s_mds;
2936 struct ceph_mds_client *mdsc = session->s_mdsc;
2937 struct ceph_client *cl = mdsc->fsc->client;
2938 struct ceph_msg *msg;
2939 struct ceph_mds_request_head_legacy *lhead;
2940 const char *path1 = NULL;
2941 const char *path2 = NULL;
2942 u64 ino1 = 0, ino2 = 0;
2943 int pathlen1 = 0, pathlen2 = 0;
2944 bool freepath1 = false, freepath2 = false;
2945 struct dentry *old_dentry = NULL;
2946 int len;
2947 u16 releases;
2948 void *p, *end;
2949 int ret;
2950 bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
2951 u16 request_head_version = mds_supported_head_version(session);
2952
2953 ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
2954 req->r_parent, req->r_path1, req->r_ino1.ino,
2955 &path1, &pathlen1, &ino1, &freepath1,
2956 test_bit(CEPH_MDS_R_PARENT_LOCKED,
2957 &req->r_req_flags));
2958 if (ret < 0) {
2959 msg = ERR_PTR(ret);
2960 goto out;
2961 }
2962
2963 /* If r_old_dentry is set, then assume that its parent is locked */
2964 if (req->r_old_dentry &&
2965 !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
2966 old_dentry = req->r_old_dentry;
2967 ret = set_request_path_attr(mdsc, NULL, old_dentry,
2968 req->r_old_dentry_dir,
2969 req->r_path2, req->r_ino2.ino,
2970 &path2, &pathlen2, &ino2, &freepath2, true);
2971 if (ret < 0) {
2972 msg = ERR_PTR(ret);
2973 goto out_free1;
2974 }
2975
2976 req->r_altname = get_fscrypt_altname(req, &req->r_altname_len);
2977 if (IS_ERR(req->r_altname)) {
2978 msg = ERR_CAST(req->r_altname);
2979 req->r_altname = NULL;
2980 goto out_free2;
2981 }
2982
2983 /*
2984 * For old cephs without supporting the 32bit retry/fwd feature
2985 * it will copy the raw memories directly when decoding the
2986 * requests. While new cephs will decode the head depending the
2987 * version member, so we need to make sure it will be compatible
2988 * with them both.
2989 */
2990 if (legacy)
2991 len = sizeof(struct ceph_mds_request_head_legacy);
2992 else if (request_head_version == 1)
2993 len = sizeof(struct ceph_mds_request_head_old);
2994 else if (request_head_version == 2)
2995 len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
2996 else
2997 len = sizeof(struct ceph_mds_request_head);
2998
2999 /* filepaths */
3000 len += 2 * (1 + sizeof(u32) + sizeof(u64));
3001 len += pathlen1 + pathlen2;
3002
3003 /* cap releases */
3004 len += sizeof(struct ceph_mds_request_release) *
3005 (!!req->r_inode_drop + !!req->r_dentry_drop +
3006 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
3007
3008 if (req->r_dentry_drop)
3009 len += pathlen1;
3010 if (req->r_old_dentry_drop)
3011 len += pathlen2;
3012
3013 /* MClientRequest tail */
3014
3015 /* req->r_stamp */
3016 len += sizeof(struct ceph_timespec);
3017
3018 /* gid list */
3019 len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
3020
3021 /* alternate name */
3022 len += sizeof(u32) + req->r_altname_len;
3023
3024 /* fscrypt_auth */
3025 len += sizeof(u32); // fscrypt_auth
3026 if (req->r_fscrypt_auth)
3027 len += ceph_fscrypt_auth_len(req->r_fscrypt_auth);
3028
3029 /* fscrypt_file */
3030 len += sizeof(u32);
3031 if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags))
3032 len += sizeof(__le64);
3033
3034 msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
3035 if (!msg) {
3036 msg = ERR_PTR(-ENOMEM);
3037 goto out_free2;
3038 }
3039
3040 msg->hdr.tid = cpu_to_le64(req->r_tid);
3041
3042 lhead = find_legacy_request_head(msg->front.iov_base,
3043 session->s_con.peer_features);
3044
3045 if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
3046 !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
3047 pr_err_ratelimited_client(cl,
3048 "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
3049 " is not supported by MDS. Fail request with -EIO.\n");
3050
3051 ret = -EIO;
3052 goto out_err;
3053 }
3054
3055 /*
3056 * The ceph_mds_request_head_legacy didn't contain a version field, and
3057 * one was added when we moved the message version from 3->4.
3058 */
3059 if (legacy) {
3060 msg->hdr.version = cpu_to_le16(3);
3061 p = msg->front.iov_base + sizeof(*lhead);
3062 } else if (request_head_version == 1) {
3063 struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
3064
3065 msg->hdr.version = cpu_to_le16(4);
3066 ohead->version = cpu_to_le16(1);
3067 p = msg->front.iov_base + sizeof(*ohead);
3068 } else if (request_head_version == 2) {
3069 struct ceph_mds_request_head *nhead = msg->front.iov_base;
3070
3071 msg->hdr.version = cpu_to_le16(6);
3072 nhead->version = cpu_to_le16(2);
3073
3074 p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
3075 } else {
3076 struct ceph_mds_request_head *nhead = msg->front.iov_base;
3077 kuid_t owner_fsuid;
3078 kgid_t owner_fsgid;
3079
3080 msg->hdr.version = cpu_to_le16(6);
3081 nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> 3082 nhead->struct_len = sizeof(struct ceph_mds_request_head);
3083
3084 owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
3085 VFSUIDT_INIT(req->r_cred->fsuid));
3086 owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
3087 VFSGIDT_INIT(req->r_cred->fsgid));
3088 nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
3089 nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
3090 p = msg->front.iov_base + sizeof(*nhead);
3091 }
3092
3093 end = msg->front.iov_base + msg->front.iov_len;
3094
3095 lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
3096 lhead->op = cpu_to_le32(req->r_op);
3097 lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
3098 req->r_cred->fsuid));
3099 lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
3100 req->r_cred->fsgid));
3101 lhead->ino = cpu_to_le64(req->r_deleg_ino);
3102 lhead->args = req->r_args;
3103
3104 ceph_encode_filepath(&p, end, ino1, path1);
3105 ceph_encode_filepath(&p, end, ino2, path2);
3106
3107 /* make note of release offset, in case we need to replay */
3108 req->r_request_release_offset = p - msg->front.iov_base;
3109
3110 /* cap releases */
3111 releases = 0;
3112 if (req->r_inode_drop)
3113 releases += ceph_encode_inode_release(&p,
3114 req->r_inode ? req->r_inode : d_inode(req->r_dentry),
3115 mds, req->r_inode_drop, req->r_inode_unless,
3116 req->r_op == CEPH_MDS_OP_READDIR);
3117 if (req->r_dentry_drop) {
3118 ret = ceph_encode_dentry_release(&p, req->r_dentry,
3119 req->r_parent, mds, req->r_dentry_drop,
3120 req->r_dentry_unless);
3121 if (ret < 0)
3122 goto out_err;
3123 releases += ret;
3124 }
3125 if (req->r_old_dentry_drop) {
3126 ret = ceph_encode_dentry_release(&p, req->r_old_dentry,
3127 req->r_old_dentry_dir, mds,
3128 req->r_old_dentry_drop,
3129 req->r_old_dentry_unless);
3130 if (ret < 0)
3131 goto out_err;
3132 releases += ret;
3133 }
3134 if (req->r_old_inode_drop)
3135 releases += ceph_encode_inode_release(&p,
3136 d_inode(req->r_old_dentry),
3137 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
3138
3139 if (drop_cap_releases) {
3140 releases = 0;
3141 p = msg->front.iov_base + req->r_request_release_offset;
3142 }
3143
3144 lhead->num_releases = cpu_to_le16(releases);
3145
3146 encode_mclientrequest_tail(&p, req);
3147
3148 if (WARN_ON_ONCE(p > end)) {
3149 ceph_msg_put(msg);
3150 msg = ERR_PTR(-ERANGE);
3151 goto out_free2;
3152 }
3153
3154 msg->front.iov_len = p - msg->front.iov_base;
3155 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
3156
3157 if (req->r_pagelist) {
3158 struct ceph_pagelist *pagelist = req->r_pagelist;
3159 ceph_msg_data_add_pagelist(msg, pagelist);
3160 msg->hdr.data_len = cpu_to_le32(pagelist->length);
3161 } else {
3162 msg->hdr.data_len = 0;
3163 }
3164
3165 msg->hdr.data_off = cpu_to_le16(0);
3166
3167 out_free2:
3168 if (freepath2)
3169 ceph_mdsc_free_path((char *)path2, pathlen2);
3170 out_free1:
3171 if (freepath1)
3172 ceph_mdsc_free_path((char *)path1, pathlen1);
3173 out:
3174 return msg;
3175 out_err:
3176 ceph_msg_put(msg);
3177 msg = ERR_PTR(ret);
3178 goto out_free2;
3179 }
3180

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2023-08-05 13:09:03

by Aleksandr Mikhalitsyn

[permalink] [raw]
Subject: Re: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()

On Sat, Aug 5, 2023 at 3:56 AM kernel test robot <[email protected]> wrote:
>
> Hi Alexander,
>
> kernel test robot noticed the following build warnings:
>
> [auto build test WARNING on ceph-client/testing]
> [cannot apply to ceph-client/for-linus brauner-vfs/vfs.all linus/master vfs-idmapping/for-next v6.5-rc4 next-20230804]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Alexander-Mikhalitsyn/fs-export-mnt_idmap_get-mnt_idmap_put/20230804-165330
> base: https://github.com/ceph/ceph-client.git testing
> patch link: https://lore.kernel.org/r/20230804084858.126104-4-aleksandr.mikhalitsyn%40canonical.com
> patch subject: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()
> config: um-randconfig-r091-20230730 (https://download.01.org/0day-ci/archive/20230805/[email protected]/config)
> compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
> reproduce: (https://download.01.org/0day-ci/archive/20230805/[email protected]/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <[email protected]>
> | Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
>
> sparse warnings: (new ones prefixed by >>)
> >> fs/ceph/mds_client.c:3082:35: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __le32 [usertype] struct_len @@ got unsigned long @@
> fs/ceph/mds_client.c:3082:35: sparse: expected restricted __le32 [usertype] struct_len
> fs/ceph/mds_client.c:3082:35: sparse: got unsigned long
>
> vim +3082 fs/ceph/mds_client.c
>
> 2927
> 2928 /*
> 2929 * called under mdsc->mutex
> 2930 */
> 2931 static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> 2932 struct ceph_mds_request *req,
> 2933 bool drop_cap_releases)
> 2934 {
> 2935 int mds = session->s_mds;
> 2936 struct ceph_mds_client *mdsc = session->s_mdsc;
> 2937 struct ceph_client *cl = mdsc->fsc->client;
> 2938 struct ceph_msg *msg;
> 2939 struct ceph_mds_request_head_legacy *lhead;
> 2940 const char *path1 = NULL;
> 2941 const char *path2 = NULL;
> 2942 u64 ino1 = 0, ino2 = 0;
> 2943 int pathlen1 = 0, pathlen2 = 0;
> 2944 bool freepath1 = false, freepath2 = false;
> 2945 struct dentry *old_dentry = NULL;
> 2946 int len;
> 2947 u16 releases;
> 2948 void *p, *end;
> 2949 int ret;
> 2950 bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
> 2951 u16 request_head_version = mds_supported_head_version(session);
> 2952
> 2953 ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> 2954 req->r_parent, req->r_path1, req->r_ino1.ino,
> 2955 &path1, &pathlen1, &ino1, &freepath1,
> 2956 test_bit(CEPH_MDS_R_PARENT_LOCKED,
> 2957 &req->r_req_flags));
> 2958 if (ret < 0) {
> 2959 msg = ERR_PTR(ret);
> 2960 goto out;
> 2961 }
> 2962
> 2963 /* If r_old_dentry is set, then assume that its parent is locked */
> 2964 if (req->r_old_dentry &&
> 2965 !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
> 2966 old_dentry = req->r_old_dentry;
> 2967 ret = set_request_path_attr(mdsc, NULL, old_dentry,
> 2968 req->r_old_dentry_dir,
> 2969 req->r_path2, req->r_ino2.ino,
> 2970 &path2, &pathlen2, &ino2, &freepath2, true);
> 2971 if (ret < 0) {
> 2972 msg = ERR_PTR(ret);
> 2973 goto out_free1;
> 2974 }
> 2975
> 2976 req->r_altname = get_fscrypt_altname(req, &req->r_altname_len);
> 2977 if (IS_ERR(req->r_altname)) {
> 2978 msg = ERR_CAST(req->r_altname);
> 2979 req->r_altname = NULL;
> 2980 goto out_free2;
> 2981 }
> 2982
> 2983 /*
> 2984 * For old cephs without supporting the 32bit retry/fwd feature
> 2985 * it will copy the raw memories directly when decoding the
> 2986 * requests. While new cephs will decode the head depending the
> 2987 * version member, so we need to make sure it will be compatible
> 2988 * with them both.
> 2989 */
> 2990 if (legacy)
> 2991 len = sizeof(struct ceph_mds_request_head_legacy);
> 2992 else if (request_head_version == 1)
> 2993 len = sizeof(struct ceph_mds_request_head_old);
> 2994 else if (request_head_version == 2)
> 2995 len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> 2996 else
> 2997 len = sizeof(struct ceph_mds_request_head);
> 2998
> 2999 /* filepaths */
> 3000 len += 2 * (1 + sizeof(u32) + sizeof(u64));
> 3001 len += pathlen1 + pathlen2;
> 3002
> 3003 /* cap releases */
> 3004 len += sizeof(struct ceph_mds_request_release) *
> 3005 (!!req->r_inode_drop + !!req->r_dentry_drop +
> 3006 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
> 3007
> 3008 if (req->r_dentry_drop)
> 3009 len += pathlen1;
> 3010 if (req->r_old_dentry_drop)
> 3011 len += pathlen2;
> 3012
> 3013 /* MClientRequest tail */
> 3014
> 3015 /* req->r_stamp */
> 3016 len += sizeof(struct ceph_timespec);
> 3017
> 3018 /* gid list */
> 3019 len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
> 3020
> 3021 /* alternate name */
> 3022 len += sizeof(u32) + req->r_altname_len;
> 3023
> 3024 /* fscrypt_auth */
> 3025 len += sizeof(u32); // fscrypt_auth
> 3026 if (req->r_fscrypt_auth)
> 3027 len += ceph_fscrypt_auth_len(req->r_fscrypt_auth);
> 3028
> 3029 /* fscrypt_file */
> 3030 len += sizeof(u32);
> 3031 if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags))
> 3032 len += sizeof(__le64);
> 3033
> 3034 msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
> 3035 if (!msg) {
> 3036 msg = ERR_PTR(-ENOMEM);
> 3037 goto out_free2;
> 3038 }
> 3039
> 3040 msg->hdr.tid = cpu_to_le64(req->r_tid);
> 3041
> 3042 lhead = find_legacy_request_head(msg->front.iov_base,
> 3043 session->s_con.peer_features);
> 3044
> 3045 if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
> 3046 !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
> 3047 pr_err_ratelimited_client(cl,
> 3048 "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
> 3049 " is not supported by MDS. Fail request with -EIO.\n");
> 3050
> 3051 ret = -EIO;
> 3052 goto out_err;
> 3053 }
> 3054
> 3055 /*
> 3056 * The ceph_mds_request_head_legacy didn't contain a version field, and
> 3057 * one was added when we moved the message version from 3->4.
> 3058 */
> 3059 if (legacy) {
> 3060 msg->hdr.version = cpu_to_le16(3);
> 3061 p = msg->front.iov_base + sizeof(*lhead);
> 3062 } else if (request_head_version == 1) {
> 3063 struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
> 3064
> 3065 msg->hdr.version = cpu_to_le16(4);
> 3066 ohead->version = cpu_to_le16(1);
> 3067 p = msg->front.iov_base + sizeof(*ohead);
> 3068 } else if (request_head_version == 2) {
> 3069 struct ceph_mds_request_head *nhead = msg->front.iov_base;
> 3070
> 3071 msg->hdr.version = cpu_to_le16(6);
> 3072 nhead->version = cpu_to_le16(2);
> 3073
> 3074 p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> 3075 } else {
> 3076 struct ceph_mds_request_head *nhead = msg->front.iov_base;
> 3077 kuid_t owner_fsuid;
> 3078 kgid_t owner_fsgid;
> 3079
> 3080 msg->hdr.version = cpu_to_le16(6);
> 3081 nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> > 3082 nhead->struct_len = sizeof(struct ceph_mds_request_head);

should be
nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head));

Fixed and pushed https://github.com/mihalicyn/linux/commits/fs.idmapped.ceph


> 3083
> 3084 owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
> 3085 VFSUIDT_INIT(req->r_cred->fsuid));
> 3086 owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
> 3087 VFSGIDT_INIT(req->r_cred->fsgid));
> 3088 nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
> 3089 nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
> 3090 p = msg->front.iov_base + sizeof(*nhead);
> 3091 }
> 3092
> 3093 end = msg->front.iov_base + msg->front.iov_len;
> 3094
> 3095 lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
> 3096 lhead->op = cpu_to_le32(req->r_op);
> 3097 lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
> 3098 req->r_cred->fsuid));
> 3099 lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
> 3100 req->r_cred->fsgid));
> 3101 lhead->ino = cpu_to_le64(req->r_deleg_ino);
> 3102 lhead->args = req->r_args;
> 3103
> 3104 ceph_encode_filepath(&p, end, ino1, path1);
> 3105 ceph_encode_filepath(&p, end, ino2, path2);
> 3106
> 3107 /* make note of release offset, in case we need to replay */
> 3108 req->r_request_release_offset = p - msg->front.iov_base;
> 3109
> 3110 /* cap releases */
> 3111 releases = 0;
> 3112 if (req->r_inode_drop)
> 3113 releases += ceph_encode_inode_release(&p,
> 3114 req->r_inode ? req->r_inode : d_inode(req->r_dentry),
> 3115 mds, req->r_inode_drop, req->r_inode_unless,
> 3116 req->r_op == CEPH_MDS_OP_READDIR);
> 3117 if (req->r_dentry_drop) {
> 3118 ret = ceph_encode_dentry_release(&p, req->r_dentry,
> 3119 req->r_parent, mds, req->r_dentry_drop,
> 3120 req->r_dentry_unless);
> 3121 if (ret < 0)
> 3122 goto out_err;
> 3123 releases += ret;
> 3124 }
> 3125 if (req->r_old_dentry_drop) {
> 3126 ret = ceph_encode_dentry_release(&p, req->r_old_dentry,
> 3127 req->r_old_dentry_dir, mds,
> 3128 req->r_old_dentry_drop,
> 3129 req->r_old_dentry_unless);
> 3130 if (ret < 0)
> 3131 goto out_err;
> 3132 releases += ret;
> 3133 }
> 3134 if (req->r_old_inode_drop)
> 3135 releases += ceph_encode_inode_release(&p,
> 3136 d_inode(req->r_old_dentry),
> 3137 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
> 3138
> 3139 if (drop_cap_releases) {
> 3140 releases = 0;
> 3141 p = msg->front.iov_base + req->r_request_release_offset;
> 3142 }
> 3143
> 3144 lhead->num_releases = cpu_to_le16(releases);
> 3145
> 3146 encode_mclientrequest_tail(&p, req);
> 3147
> 3148 if (WARN_ON_ONCE(p > end)) {
> 3149 ceph_msg_put(msg);
> 3150 msg = ERR_PTR(-ERANGE);
> 3151 goto out_free2;
> 3152 }
> 3153
> 3154 msg->front.iov_len = p - msg->front.iov_base;
> 3155 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
> 3156
> 3157 if (req->r_pagelist) {
> 3158 struct ceph_pagelist *pagelist = req->r_pagelist;
> 3159 ceph_msg_data_add_pagelist(msg, pagelist);
> 3160 msg->hdr.data_len = cpu_to_le32(pagelist->length);
> 3161 } else {
> 3162 msg->hdr.data_len = 0;
> 3163 }
> 3164
> 3165 msg->hdr.data_off = cpu_to_le16(0);
> 3166
> 3167 out_free2:
> 3168 if (freepath2)
> 3169 ceph_mdsc_free_path((char *)path2, pathlen2);
> 3170 out_free1:
> 3171 if (freepath1)
> 3172 ceph_mdsc_free_path((char *)path1, pathlen1);
> 3173 out:
> 3174 return msg;
> 3175 out_err:
> 3176 ceph_msg_put(msg);
> 3177 msg = ERR_PTR(ret);
> 3178 goto out_free2;
> 3179 }
> 3180
>
> --
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki

2023-08-07 01:46:28

by Xiubo Li

[permalink] [raw]
Subject: Re: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()


On 8/4/23 16:48, Alexander Mikhalitsyn wrote:
> From: Christian Brauner <[email protected]>
>
> Inode operations that create a new filesystem object such as ->mknod,
> ->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
> Instead the caller's fs{g,u}id is used for the {g,u}id of the new
> filesystem object.
>
> In order to ensure that the correct {g,u}id is used map the caller's
> fs{g,u}id for creation requests. This doesn't require complex changes.
> It suffices to pass in the relevant idmapping recorded in the request
> message. If this request message was triggered from an inode operation
> that creates filesystem objects it will have passed down the relevant
> idmaping. If this is a request message that was triggered from an inode
> operation that doens't need to take idmappings into account the initial
> idmapping is passed down which is an identity mapping.
>
> This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
> which adds two new fields (owner_{u,g}id) to the request head structure.
> So, we need to ensure that MDS supports it otherwise we need to fail
> any IO that comes through an idmapped mount because we can't process it
> in a proper way. MDS server without such an extension will use caller_{u,g}id
> fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
> values are unmapped. At the same time we can't map these fields with an
> idmapping as it can break UID/GID-based permission checks logic on the
> MDS side. This problem was described with a lot of details at [1], [2].
>
> [1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
> [2] https://lore.kernel.org/all/[email protected]/
>
> Link: https://github.com/ceph/ceph/pull/52575
> Link: https://tracker.ceph.com/issues/62217
> Cc: Xiubo Li <[email protected]>
> Cc: Jeff Layton <[email protected]>
> Cc: Ilya Dryomov <[email protected]>
> Cc: [email protected]
> Co-Developed-by: Alexander Mikhalitsyn <[email protected]>
> Signed-off-by: Christian Brauner <[email protected]>
> Signed-off-by: Alexander Mikhalitsyn <[email protected]>
> ---
> v7:
> - reworked to use two new fields for owner UID/GID (https://github.com/ceph/ceph/pull/52575)
> v8:
> - properly handled case when old MDS used with new kernel client
> ---
> fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++++++---
> fs/ceph/mds_client.h | 5 +++-
> include/linux/ceph/ceph_fs.h | 5 +++-
> 3 files changed, 52 insertions(+), 5 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 8829f55103da..41e4bf3811c4 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2902,6 +2902,17 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
> }
> }
>
> +static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
> +{
> + if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
> + return 1;
> +
> + if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
> + return 2;
> +
> + return CEPH_MDS_REQUEST_HEAD_VERSION;
> +}
> +
> static struct ceph_mds_request_head_legacy *
> find_legacy_request_head(void *p, u64 features)
> {
> @@ -2923,6 +2934,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> {
> int mds = session->s_mds;
> struct ceph_mds_client *mdsc = session->s_mdsc;
> + struct ceph_client *cl = mdsc->fsc->client;
> struct ceph_msg *msg;
> struct ceph_mds_request_head_legacy *lhead;
> const char *path1 = NULL;
> @@ -2936,7 +2948,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> void *p, *end;
> int ret;
> bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
> - bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
> + u16 request_head_version = mds_supported_head_version(session);
>
> ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> req->r_parent, req->r_path1, req->r_ino1.ino,
> @@ -2977,8 +2989,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> */
> if (legacy)
> len = sizeof(struct ceph_mds_request_head_legacy);
> - else if (old_version)
> + else if (request_head_version == 1)
> len = sizeof(struct ceph_mds_request_head_old);
> + else if (request_head_version == 2)
> + len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> else
> len = sizeof(struct ceph_mds_request_head);
>
> @@ -3028,6 +3042,16 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> lhead = find_legacy_request_head(msg->front.iov_base,
> session->s_con.peer_features);
>
> + if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
> + !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
> + pr_err_ratelimited_client(cl,
> + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
> + " is not supported by MDS. Fail request with -EIO.\n");
> +
> + ret = -EIO;
> + goto out_err;
> + }
> +
> /*
> * The ceph_mds_request_head_legacy didn't contain a version field, and
> * one was added when we moved the message version from 3->4.
> @@ -3035,17 +3059,34 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> if (legacy) {
> msg->hdr.version = cpu_to_le16(3);
> p = msg->front.iov_base + sizeof(*lhead);
> - } else if (old_version) {
> + } else if (request_head_version == 1) {
> struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
>
> msg->hdr.version = cpu_to_le16(4);
> ohead->version = cpu_to_le16(1);
> p = msg->front.iov_base + sizeof(*ohead);
> + } else if (request_head_version == 2) {
> + struct ceph_mds_request_head *nhead = msg->front.iov_base;
> +
> + msg->hdr.version = cpu_to_le16(6);
> + nhead->version = cpu_to_le16(2);
> +
> + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> } else {
> struct ceph_mds_request_head *nhead = msg->front.iov_base;
> + kuid_t owner_fsuid;
> + kgid_t owner_fsgid;
>
> msg->hdr.version = cpu_to_le16(6);
> nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> + nhead->struct_len = sizeof(struct ceph_mds_request_head);
> +
> + owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
> + VFSUIDT_INIT(req->r_cred->fsuid));
> + owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
> + VFSGIDT_INIT(req->r_cred->fsgid));
> + nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
> + nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
> p = msg->front.iov_base + sizeof(*nhead);
> }
>
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index e3bbf3ba8ee8..8f683e8203bd 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -33,8 +33,10 @@ enum ceph_feature_type {
> CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
> CEPHFS_FEATURE_OP_GETVXATTR,
> CEPHFS_FEATURE_32BITS_RETRY_FWD,
> + CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
> + CEPHFS_FEATURE_HAS_OWNER_UIDGID,
>
> - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
> + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
> };
>
> #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
> @@ -49,6 +51,7 @@ enum ceph_feature_type {
> CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
> CEPHFS_FEATURE_OP_GETVXATTR, \
> CEPHFS_FEATURE_32BITS_RETRY_FWD, \
> + CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
> }
>
> /*
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index 5f2301ee88bc..b91699b08f26 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -499,7 +499,7 @@ struct ceph_mds_request_head_legacy {
> union ceph_mds_request_args args;
> } __attribute__ ((packed));
>
> -#define CEPH_MDS_REQUEST_HEAD_VERSION 2
> +#define CEPH_MDS_REQUEST_HEAD_VERSION 3
>
> struct ceph_mds_request_head_old {
> __le16 version; /* struct version */
> @@ -530,6 +530,9 @@ struct ceph_mds_request_head {
>
> __le32 ext_num_retry; /* new count retry attempts */
> __le32 ext_num_fwd; /* new count fwd attempts */
> +
> + __le32 struct_len; /* to store size of struct ceph_mds_request_head */
> + __le32 owner_uid, owner_gid; /* used for OPs which create inodes */

Let's also initialize them to -1 for all the other requests as we do in
your PR.

Thanks

- Xiubo



> } __attribute__ ((packed));
>
> /* cap/lease release record */


2023-08-07 07:27:18

by Aleksandr Mikhalitsyn

[permalink] [raw]
Subject: Re: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()

On Mon, Aug 7, 2023 at 3:25 AM Xiubo Li <[email protected]> wrote:
>
>
> On 8/4/23 16:48, Alexander Mikhalitsyn wrote:
> > From: Christian Brauner <[email protected]>
> >
> > Inode operations that create a new filesystem object such as ->mknod,
> > ->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
> > Instead the caller's fs{g,u}id is used for the {g,u}id of the new
> > filesystem object.
> >
> > In order to ensure that the correct {g,u}id is used map the caller's
> > fs{g,u}id for creation requests. This doesn't require complex changes.
> > It suffices to pass in the relevant idmapping recorded in the request
> > message. If this request message was triggered from an inode operation
> > that creates filesystem objects it will have passed down the relevant
> > idmaping. If this is a request message that was triggered from an inode
> > operation that doens't need to take idmappings into account the initial
> > idmapping is passed down which is an identity mapping.
> >
> > This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
> > which adds two new fields (owner_{u,g}id) to the request head structure.
> > So, we need to ensure that MDS supports it otherwise we need to fail
> > any IO that comes through an idmapped mount because we can't process it
> > in a proper way. MDS server without such an extension will use caller_{u,g}id
> > fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
> > values are unmapped. At the same time we can't map these fields with an
> > idmapping as it can break UID/GID-based permission checks logic on the
> > MDS side. This problem was described with a lot of details at [1], [2].
> >
> > [1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
> > [2] https://lore.kernel.org/all/[email protected]/
> >
> > Link: https://github.com/ceph/ceph/pull/52575
> > Link: https://tracker.ceph.com/issues/62217
> > Cc: Xiubo Li <[email protected]>
> > Cc: Jeff Layton <[email protected]>
> > Cc: Ilya Dryomov <[email protected]>
> > Cc: [email protected]
> > Co-Developed-by: Alexander Mikhalitsyn <[email protected]>
> > Signed-off-by: Christian Brauner <[email protected]>
> > Signed-off-by: Alexander Mikhalitsyn <[email protected]>
> > ---
> > v7:
> > - reworked to use two new fields for owner UID/GID (https://github.com/ceph/ceph/pull/52575)
> > v8:
> > - properly handled case when old MDS used with new kernel client
> > ---
> > fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++++++---
> > fs/ceph/mds_client.h | 5 +++-
> > include/linux/ceph/ceph_fs.h | 5 +++-
> > 3 files changed, 52 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index 8829f55103da..41e4bf3811c4 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -2902,6 +2902,17 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
> > }
> > }
> >
> > +static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
> > +{
> > + if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
> > + return 1;
> > +
> > + if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
> > + return 2;
> > +
> > + return CEPH_MDS_REQUEST_HEAD_VERSION;
> > +}
> > +
> > static struct ceph_mds_request_head_legacy *
> > find_legacy_request_head(void *p, u64 features)
> > {
> > @@ -2923,6 +2934,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> > {
> > int mds = session->s_mds;
> > struct ceph_mds_client *mdsc = session->s_mdsc;
> > + struct ceph_client *cl = mdsc->fsc->client;
> > struct ceph_msg *msg;
> > struct ceph_mds_request_head_legacy *lhead;
> > const char *path1 = NULL;
> > @@ -2936,7 +2948,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> > void *p, *end;
> > int ret;
> > bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
> > - bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
> > + u16 request_head_version = mds_supported_head_version(session);
> >
> > ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> > req->r_parent, req->r_path1, req->r_ino1.ino,
> > @@ -2977,8 +2989,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> > */
> > if (legacy)
> > len = sizeof(struct ceph_mds_request_head_legacy);
> > - else if (old_version)
> > + else if (request_head_version == 1)
> > len = sizeof(struct ceph_mds_request_head_old);
> > + else if (request_head_version == 2)
> > + len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> > else
> > len = sizeof(struct ceph_mds_request_head);
> >
> > @@ -3028,6 +3042,16 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> > lhead = find_legacy_request_head(msg->front.iov_base,
> > session->s_con.peer_features);
> >
> > + if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
> > + !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
> > + pr_err_ratelimited_client(cl,
> > + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
> > + " is not supported by MDS. Fail request with -EIO.\n");
> > +
> > + ret = -EIO;
> > + goto out_err;
> > + }
> > +
> > /*
> > * The ceph_mds_request_head_legacy didn't contain a version field, and
> > * one was added when we moved the message version from 3->4.
> > @@ -3035,17 +3059,34 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> > if (legacy) {
> > msg->hdr.version = cpu_to_le16(3);
> > p = msg->front.iov_base + sizeof(*lhead);
> > - } else if (old_version) {
> > + } else if (request_head_version == 1) {
> > struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
> >
> > msg->hdr.version = cpu_to_le16(4);
> > ohead->version = cpu_to_le16(1);
> > p = msg->front.iov_base + sizeof(*ohead);
> > + } else if (request_head_version == 2) {
> > + struct ceph_mds_request_head *nhead = msg->front.iov_base;
> > +
> > + msg->hdr.version = cpu_to_le16(6);
> > + nhead->version = cpu_to_le16(2);
> > +
> > + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> > } else {
> > struct ceph_mds_request_head *nhead = msg->front.iov_base;
> > + kuid_t owner_fsuid;
> > + kgid_t owner_fsgid;
> >
> > msg->hdr.version = cpu_to_le16(6);
> > nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> > + nhead->struct_len = sizeof(struct ceph_mds_request_head);
> > +
> > + owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
> > + VFSUIDT_INIT(req->r_cred->fsuid));
> > + owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
> > + VFSGIDT_INIT(req->r_cred->fsgid));
> > + nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
> > + nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
> > p = msg->front.iov_base + sizeof(*nhead);
> > }
> >
> > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > index e3bbf3ba8ee8..8f683e8203bd 100644
> > --- a/fs/ceph/mds_client.h
> > +++ b/fs/ceph/mds_client.h
> > @@ -33,8 +33,10 @@ enum ceph_feature_type {
> > CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
> > CEPHFS_FEATURE_OP_GETVXATTR,
> > CEPHFS_FEATURE_32BITS_RETRY_FWD,
> > + CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
> > + CEPHFS_FEATURE_HAS_OWNER_UIDGID,
> >
> > - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
> > + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
> > };
> >
> > #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
> > @@ -49,6 +51,7 @@ enum ceph_feature_type {
> > CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
> > CEPHFS_FEATURE_OP_GETVXATTR, \
> > CEPHFS_FEATURE_32BITS_RETRY_FWD, \
> > + CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
> > }
> >
> > /*
> > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> > index 5f2301ee88bc..b91699b08f26 100644
> > --- a/include/linux/ceph/ceph_fs.h
> > +++ b/include/linux/ceph/ceph_fs.h
> > @@ -499,7 +499,7 @@ struct ceph_mds_request_head_legacy {
> > union ceph_mds_request_args args;
> > } __attribute__ ((packed));
> >
> > -#define CEPH_MDS_REQUEST_HEAD_VERSION 2
> > +#define CEPH_MDS_REQUEST_HEAD_VERSION 3
> >
> > struct ceph_mds_request_head_old {
> > __le16 version; /* struct version */
> > @@ -530,6 +530,9 @@ struct ceph_mds_request_head {
> >
> > __le32 ext_num_retry; /* new count retry attempts */
> > __le32 ext_num_fwd; /* new count fwd attempts */
> > +
> > + __le32 struct_len; /* to store size of struct ceph_mds_request_head */
> > + __le32 owner_uid, owner_gid; /* used for OPs which create inodes */
>
> Let's also initialize them to -1 for all the other requests as we do in
> your PR.

They are always initialized already. As you can see from the code we
don't have any extra conditions
on filling these fields. We always fill them with an appropriate
UID/GID. If mount is not idmapped then it's just == caller_uid/gid,
if mount idmapped then it's idmapped caller_uid/gid.

Kind regards,
Alex

>
> Thanks
>
> - Xiubo
>
>
>
> > } __attribute__ ((packed));
> >
> > /* cap/lease release record */
>

2023-08-07 11:55:31

by Aleksandr Mikhalitsyn

[permalink] [raw]
Subject: Re: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()

On Mon, Aug 7, 2023 at 12:28 PM Xiubo Li <[email protected]> wrote:
>
>
> On 8/7/23 14:51, Aleksandr Mikhalitsyn wrote:
> > On Mon, Aug 7, 2023 at 3:25 AM Xiubo Li <[email protected]> wrote:
> >>
> >> On 8/4/23 16:48, Alexander Mikhalitsyn wrote:
> >>> From: Christian Brauner <[email protected]>
> >>>
> >>> Inode operations that create a new filesystem object such as ->mknod,
> >>> ->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
> >>> Instead the caller's fs{g,u}id is used for the {g,u}id of the new
> >>> filesystem object.
> >>>
> >>> In order to ensure that the correct {g,u}id is used map the caller's
> >>> fs{g,u}id for creation requests. This doesn't require complex changes.
> >>> It suffices to pass in the relevant idmapping recorded in the request
> >>> message. If this request message was triggered from an inode operation
> >>> that creates filesystem objects it will have passed down the relevant
> >>> idmaping. If this is a request message that was triggered from an inode
> >>> operation that doens't need to take idmappings into account the initial
> >>> idmapping is passed down which is an identity mapping.
> >>>
> >>> This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
> >>> which adds two new fields (owner_{u,g}id) to the request head structure.
> >>> So, we need to ensure that MDS supports it otherwise we need to fail
> >>> any IO that comes through an idmapped mount because we can't process it
> >>> in a proper way. MDS server without such an extension will use caller_{u,g}id
> >>> fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
> >>> values are unmapped. At the same time we can't map these fields with an
> >>> idmapping as it can break UID/GID-based permission checks logic on the
> >>> MDS side. This problem was described with a lot of details at [1], [2].
> >>>
> >>> [1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
> >>> [2] https://lore.kernel.org/all/[email protected]/
> >>>
> >>> Link: https://github.com/ceph/ceph/pull/52575
> >>> Link: https://tracker.ceph.com/issues/62217
> >>> Cc: Xiubo Li <[email protected]>
> >>> Cc: Jeff Layton <[email protected]>
> >>> Cc: Ilya Dryomov <[email protected]>
> >>> Cc: [email protected]
> >>> Co-Developed-by: Alexander Mikhalitsyn <[email protected]>
> >>> Signed-off-by: Christian Brauner <[email protected]>
> >>> Signed-off-by: Alexander Mikhalitsyn <[email protected]>
> >>> ---
> >>> v7:
> >>> - reworked to use two new fields for owner UID/GID (https://github.com/ceph/ceph/pull/52575)
> >>> v8:
> >>> - properly handled case when old MDS used with new kernel client
> >>> ---
> >>> fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++++++---
> >>> fs/ceph/mds_client.h | 5 +++-
> >>> include/linux/ceph/ceph_fs.h | 5 +++-
> >>> 3 files changed, 52 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >>> index 8829f55103da..41e4bf3811c4 100644
> >>> --- a/fs/ceph/mds_client.c
> >>> +++ b/fs/ceph/mds_client.c
> >>> @@ -2902,6 +2902,17 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
> >>> }
> >>> }
> >>>
> >>> +static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
> >>> +{
> >>> + if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
> >>> + return 1;
> >>> +
> >>> + if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
> >>> + return 2;
> >>> +
> >>> + return CEPH_MDS_REQUEST_HEAD_VERSION;
> >>> +}
> >>> +
> >>> static struct ceph_mds_request_head_legacy *
> >>> find_legacy_request_head(void *p, u64 features)
> >>> {
> >>> @@ -2923,6 +2934,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>> {
> >>> int mds = session->s_mds;
> >>> struct ceph_mds_client *mdsc = session->s_mdsc;
> >>> + struct ceph_client *cl = mdsc->fsc->client;
> >>> struct ceph_msg *msg;
> >>> struct ceph_mds_request_head_legacy *lhead;
> >>> const char *path1 = NULL;
> >>> @@ -2936,7 +2948,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>> void *p, *end;
> >>> int ret;
> >>> bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
> >>> - bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
> >>> + u16 request_head_version = mds_supported_head_version(session);
> >>>
> >>> ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> >>> req->r_parent, req->r_path1, req->r_ino1.ino,
> >>> @@ -2977,8 +2989,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>> */
> >>> if (legacy)
> >>> len = sizeof(struct ceph_mds_request_head_legacy);
> >>> - else if (old_version)
> >>> + else if (request_head_version == 1)
> >>> len = sizeof(struct ceph_mds_request_head_old);
> >>> + else if (request_head_version == 2)
> >>> + len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> >>> else
> >>> len = sizeof(struct ceph_mds_request_head);
> >>>
> >>> @@ -3028,6 +3042,16 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>> lhead = find_legacy_request_head(msg->front.iov_base,
> >>> session->s_con.peer_features);
> >>>
> >>> + if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
> >>> + !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
> >>> + pr_err_ratelimited_client(cl,
> >>> + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
> >>> + " is not supported by MDS. Fail request with -EIO.\n");
> >>> +
> >>> + ret = -EIO;
> >>> + goto out_err;
> >>> + }
> >>> +
> >>> /*
> >>> * The ceph_mds_request_head_legacy didn't contain a version field, and
> >>> * one was added when we moved the message version from 3->4.
> >>> @@ -3035,17 +3059,34 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>> if (legacy) {
> >>> msg->hdr.version = cpu_to_le16(3);
> >>> p = msg->front.iov_base + sizeof(*lhead);
> >>> - } else if (old_version) {
> >>> + } else if (request_head_version == 1) {
> >>> struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
> >>>
> >>> msg->hdr.version = cpu_to_le16(4);
> >>> ohead->version = cpu_to_le16(1);
> >>> p = msg->front.iov_base + sizeof(*ohead);
> >>> + } else if (request_head_version == 2) {
> >>> + struct ceph_mds_request_head *nhead = msg->front.iov_base;
> >>> +
> >>> + msg->hdr.version = cpu_to_le16(6);
> >>> + nhead->version = cpu_to_le16(2);
> >>> +
> >>> + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> >>> } else {
> >>> struct ceph_mds_request_head *nhead = msg->front.iov_base;
> >>> + kuid_t owner_fsuid;
> >>> + kgid_t owner_fsgid;
> >>>
> >>> msg->hdr.version = cpu_to_le16(6);
> >>> nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> >>> + nhead->struct_len = sizeof(struct ceph_mds_request_head);
> >>> +
> >>> + owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
> >>> + VFSUIDT_INIT(req->r_cred->fsuid));
> >>> + owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
> >>> + VFSGIDT_INIT(req->r_cred->fsgid));
> >>> + nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
> >>> + nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
> >>> p = msg->front.iov_base + sizeof(*nhead);
> >>> }
> >>>
> >>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> >>> index e3bbf3ba8ee8..8f683e8203bd 100644
> >>> --- a/fs/ceph/mds_client.h
> >>> +++ b/fs/ceph/mds_client.h
> >>> @@ -33,8 +33,10 @@ enum ceph_feature_type {
> >>> CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
> >>> CEPHFS_FEATURE_OP_GETVXATTR,
> >>> CEPHFS_FEATURE_32BITS_RETRY_FWD,
> >>> + CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
> >>> + CEPHFS_FEATURE_HAS_OWNER_UIDGID,
> >>>
> >>> - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
> >>> + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
> >>> };
> >>>
> >>> #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
> >>> @@ -49,6 +51,7 @@ enum ceph_feature_type {
> >>> CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
> >>> CEPHFS_FEATURE_OP_GETVXATTR, \
> >>> CEPHFS_FEATURE_32BITS_RETRY_FWD, \
> >>> + CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
> >>> }
> >>>
> >>> /*
> >>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> >>> index 5f2301ee88bc..b91699b08f26 100644
> >>> --- a/include/linux/ceph/ceph_fs.h
> >>> +++ b/include/linux/ceph/ceph_fs.h
> >>> @@ -499,7 +499,7 @@ struct ceph_mds_request_head_legacy {
> >>> union ceph_mds_request_args args;
> >>> } __attribute__ ((packed));
> >>>
> >>> -#define CEPH_MDS_REQUEST_HEAD_VERSION 2
> >>> +#define CEPH_MDS_REQUEST_HEAD_VERSION 3
> >>>
> >>> struct ceph_mds_request_head_old {
> >>> __le16 version; /* struct version */
> >>> @@ -530,6 +530,9 @@ struct ceph_mds_request_head {
> >>>
> >>> __le32 ext_num_retry; /* new count retry attempts */
> >>> __le32 ext_num_fwd; /* new count fwd attempts */
> >>> +
> >>> + __le32 struct_len; /* to store size of struct ceph_mds_request_head */
> >>> + __le32 owner_uid, owner_gid; /* used for OPs which create inodes */
> >> Let's also initialize them to -1 for all the other requests as we do in
> >> your PR.
> > They are always initialized already. As you can see from the code we
> > don't have any extra conditions
> > on filling these fields. We always fill them with an appropriate
> > UID/GID. If mount is not idmapped then it's just == caller_uid/gid,
> > if mount idmapped then it's idmapped caller_uid/gid.
>
> Then in kclient all the request will always initialized the
> 'owner_{uid/gid}' with 'caller_{uid/gid}'. While in userspace libcephfs
> it will only set them for 'create/mknod/mkdir/symlink` instead.
>
> I'd prefer to make them consistent, which is what I am still focusing
> on, to make them easier to read and comparing when troubles hooting.

Dear Xiubo,

Sure, I will do it.

Couldn't you please review all the rest of the patches before I fix
this particular thing?
It will allow me to fix and send -v10 with all required fixes
incorporated in it.

Kind regards,
Alex

>
> Thanks
>
> - Xiubo
>
> > Kind regards,
> > Alex
> >
> >> Thanks
> >>
> >> - Xiubo
> >>
> >>
> >>
> >>> } __attribute__ ((packed));
> >>>
> >>> /* cap/lease release record */
>

2023-08-07 12:01:59

by Xiubo Li

[permalink] [raw]
Subject: Re: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()


On 8/7/23 14:51, Aleksandr Mikhalitsyn wrote:
> On Mon, Aug 7, 2023 at 3:25 AM Xiubo Li <[email protected]> wrote:
>>
>> On 8/4/23 16:48, Alexander Mikhalitsyn wrote:
>>> From: Christian Brauner <[email protected]>
>>>
>>> Inode operations that create a new filesystem object such as ->mknod,
>>> ->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
>>> Instead the caller's fs{g,u}id is used for the {g,u}id of the new
>>> filesystem object.
>>>
>>> In order to ensure that the correct {g,u}id is used map the caller's
>>> fs{g,u}id for creation requests. This doesn't require complex changes.
>>> It suffices to pass in the relevant idmapping recorded in the request
>>> message. If this request message was triggered from an inode operation
>>> that creates filesystem objects it will have passed down the relevant
>>> idmaping. If this is a request message that was triggered from an inode
>>> operation that doens't need to take idmappings into account the initial
>>> idmapping is passed down which is an identity mapping.
>>>
>>> This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
>>> which adds two new fields (owner_{u,g}id) to the request head structure.
>>> So, we need to ensure that MDS supports it otherwise we need to fail
>>> any IO that comes through an idmapped mount because we can't process it
>>> in a proper way. MDS server without such an extension will use caller_{u,g}id
>>> fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
>>> values are unmapped. At the same time we can't map these fields with an
>>> idmapping as it can break UID/GID-based permission checks logic on the
>>> MDS side. This problem was described with a lot of details at [1], [2].
>>>
>>> [1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
>>> [2] https://lore.kernel.org/all/[email protected]/
>>>
>>> Link: https://github.com/ceph/ceph/pull/52575
>>> Link: https://tracker.ceph.com/issues/62217
>>> Cc: Xiubo Li <[email protected]>
>>> Cc: Jeff Layton <[email protected]>
>>> Cc: Ilya Dryomov <[email protected]>
>>> Cc: [email protected]
>>> Co-Developed-by: Alexander Mikhalitsyn <[email protected]>
>>> Signed-off-by: Christian Brauner <[email protected]>
>>> Signed-off-by: Alexander Mikhalitsyn <[email protected]>
>>> ---
>>> v7:
>>> - reworked to use two new fields for owner UID/GID (https://github.com/ceph/ceph/pull/52575)
>>> v8:
>>> - properly handled case when old MDS used with new kernel client
>>> ---
>>> fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++++++---
>>> fs/ceph/mds_client.h | 5 +++-
>>> include/linux/ceph/ceph_fs.h | 5 +++-
>>> 3 files changed, 52 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>> index 8829f55103da..41e4bf3811c4 100644
>>> --- a/fs/ceph/mds_client.c
>>> +++ b/fs/ceph/mds_client.c
>>> @@ -2902,6 +2902,17 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
>>> }
>>> }
>>>
>>> +static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
>>> +{
>>> + if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
>>> + return 1;
>>> +
>>> + if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
>>> + return 2;
>>> +
>>> + return CEPH_MDS_REQUEST_HEAD_VERSION;
>>> +}
>>> +
>>> static struct ceph_mds_request_head_legacy *
>>> find_legacy_request_head(void *p, u64 features)
>>> {
>>> @@ -2923,6 +2934,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>> {
>>> int mds = session->s_mds;
>>> struct ceph_mds_client *mdsc = session->s_mdsc;
>>> + struct ceph_client *cl = mdsc->fsc->client;
>>> struct ceph_msg *msg;
>>> struct ceph_mds_request_head_legacy *lhead;
>>> const char *path1 = NULL;
>>> @@ -2936,7 +2948,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>> void *p, *end;
>>> int ret;
>>> bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
>>> - bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
>>> + u16 request_head_version = mds_supported_head_version(session);
>>>
>>> ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
>>> req->r_parent, req->r_path1, req->r_ino1.ino,
>>> @@ -2977,8 +2989,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>> */
>>> if (legacy)
>>> len = sizeof(struct ceph_mds_request_head_legacy);
>>> - else if (old_version)
>>> + else if (request_head_version == 1)
>>> len = sizeof(struct ceph_mds_request_head_old);
>>> + else if (request_head_version == 2)
>>> + len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
>>> else
>>> len = sizeof(struct ceph_mds_request_head);
>>>
>>> @@ -3028,6 +3042,16 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>> lhead = find_legacy_request_head(msg->front.iov_base,
>>> session->s_con.peer_features);
>>>
>>> + if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
>>> + !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
>>> + pr_err_ratelimited_client(cl,
>>> + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
>>> + " is not supported by MDS. Fail request with -EIO.\n");
>>> +
>>> + ret = -EIO;
>>> + goto out_err;
>>> + }
>>> +
>>> /*
>>> * The ceph_mds_request_head_legacy didn't contain a version field, and
>>> * one was added when we moved the message version from 3->4.
>>> @@ -3035,17 +3059,34 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>> if (legacy) {
>>> msg->hdr.version = cpu_to_le16(3);
>>> p = msg->front.iov_base + sizeof(*lhead);
>>> - } else if (old_version) {
>>> + } else if (request_head_version == 1) {
>>> struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
>>>
>>> msg->hdr.version = cpu_to_le16(4);
>>> ohead->version = cpu_to_le16(1);
>>> p = msg->front.iov_base + sizeof(*ohead);
>>> + } else if (request_head_version == 2) {
>>> + struct ceph_mds_request_head *nhead = msg->front.iov_base;
>>> +
>>> + msg->hdr.version = cpu_to_le16(6);
>>> + nhead->version = cpu_to_le16(2);
>>> +
>>> + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
>>> } else {
>>> struct ceph_mds_request_head *nhead = msg->front.iov_base;
>>> + kuid_t owner_fsuid;
>>> + kgid_t owner_fsgid;
>>>
>>> msg->hdr.version = cpu_to_le16(6);
>>> nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
>>> + nhead->struct_len = sizeof(struct ceph_mds_request_head);
>>> +
>>> + owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
>>> + VFSUIDT_INIT(req->r_cred->fsuid));
>>> + owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
>>> + VFSGIDT_INIT(req->r_cred->fsgid));
>>> + nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
>>> + nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
>>> p = msg->front.iov_base + sizeof(*nhead);
>>> }
>>>
>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>> index e3bbf3ba8ee8..8f683e8203bd 100644
>>> --- a/fs/ceph/mds_client.h
>>> +++ b/fs/ceph/mds_client.h
>>> @@ -33,8 +33,10 @@ enum ceph_feature_type {
>>> CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
>>> CEPHFS_FEATURE_OP_GETVXATTR,
>>> CEPHFS_FEATURE_32BITS_RETRY_FWD,
>>> + CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
>>> + CEPHFS_FEATURE_HAS_OWNER_UIDGID,
>>>
>>> - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
>>> + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
>>> };
>>>
>>> #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
>>> @@ -49,6 +51,7 @@ enum ceph_feature_type {
>>> CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
>>> CEPHFS_FEATURE_OP_GETVXATTR, \
>>> CEPHFS_FEATURE_32BITS_RETRY_FWD, \
>>> + CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
>>> }
>>>
>>> /*
>>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>>> index 5f2301ee88bc..b91699b08f26 100644
>>> --- a/include/linux/ceph/ceph_fs.h
>>> +++ b/include/linux/ceph/ceph_fs.h
>>> @@ -499,7 +499,7 @@ struct ceph_mds_request_head_legacy {
>>> union ceph_mds_request_args args;
>>> } __attribute__ ((packed));
>>>
>>> -#define CEPH_MDS_REQUEST_HEAD_VERSION 2
>>> +#define CEPH_MDS_REQUEST_HEAD_VERSION 3
>>>
>>> struct ceph_mds_request_head_old {
>>> __le16 version; /* struct version */
>>> @@ -530,6 +530,9 @@ struct ceph_mds_request_head {
>>>
>>> __le32 ext_num_retry; /* new count retry attempts */
>>> __le32 ext_num_fwd; /* new count fwd attempts */
>>> +
>>> + __le32 struct_len; /* to store size of struct ceph_mds_request_head */
>>> + __le32 owner_uid, owner_gid; /* used for OPs which create inodes */
>> Let's also initialize them to -1 for all the other requests as we do in
>> your PR.
> They are always initialized already. As you can see from the code we
> don't have any extra conditions
> on filling these fields. We always fill them with an appropriate
> UID/GID. If mount is not idmapped then it's just == caller_uid/gid,
> if mount idmapped then it's idmapped caller_uid/gid.

Then in kclient all the request will always initialized the
'owner_{uid/gid}' with 'caller_{uid/gid}'. While in userspace libcephfs
it will only set them for 'create/mknod/mkdir/symlink` instead.

I'd prefer to make them consistent, which is what I am still focusing
on, to make them easier to read and comparing when troubles hooting.

Thanks

- Xiubo

> Kind regards,
> Alex
>
>> Thanks
>>
>> - Xiubo
>>
>>
>>
>>> } __attribute__ ((packed));
>>>
>>> /* cap/lease release record */


2023-08-07 12:29:37

by Xiubo Li

[permalink] [raw]
Subject: Re: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()


On 8/7/23 18:34, Aleksandr Mikhalitsyn wrote:
> On Mon, Aug 7, 2023 at 12:28 PM Xiubo Li <[email protected]> wrote:
>>
>> On 8/7/23 14:51, Aleksandr Mikhalitsyn wrote:
>>> On Mon, Aug 7, 2023 at 3:25 AM Xiubo Li <[email protected]> wrote:
>>>> On 8/4/23 16:48, Alexander Mikhalitsyn wrote:
>>>>> From: Christian Brauner <[email protected]>
>>>>>
>>>>> Inode operations that create a new filesystem object such as ->mknod,
>>>>> ->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
>>>>> Instead the caller's fs{g,u}id is used for the {g,u}id of the new
>>>>> filesystem object.
>>>>>
>>>>> In order to ensure that the correct {g,u}id is used map the caller's
>>>>> fs{g,u}id for creation requests. This doesn't require complex changes.
>>>>> It suffices to pass in the relevant idmapping recorded in the request
>>>>> message. If this request message was triggered from an inode operation
>>>>> that creates filesystem objects it will have passed down the relevant
>>>>> idmaping. If this is a request message that was triggered from an inode
>>>>> operation that doens't need to take idmappings into account the initial
>>>>> idmapping is passed down which is an identity mapping.
>>>>>
>>>>> This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
>>>>> which adds two new fields (owner_{u,g}id) to the request head structure.
>>>>> So, we need to ensure that MDS supports it otherwise we need to fail
>>>>> any IO that comes through an idmapped mount because we can't process it
>>>>> in a proper way. MDS server without such an extension will use caller_{u,g}id
>>>>> fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
>>>>> values are unmapped. At the same time we can't map these fields with an
>>>>> idmapping as it can break UID/GID-based permission checks logic on the
>>>>> MDS side. This problem was described with a lot of details at [1], [2].
>>>>>
>>>>> [1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
>>>>> [2] https://lore.kernel.org/all/[email protected]/
>>>>>
>>>>> Link: https://github.com/ceph/ceph/pull/52575
>>>>> Link: https://tracker.ceph.com/issues/62217
>>>>> Cc: Xiubo Li <[email protected]>
>>>>> Cc: Jeff Layton <[email protected]>
>>>>> Cc: Ilya Dryomov <[email protected]>
>>>>> Cc: [email protected]
>>>>> Co-Developed-by: Alexander Mikhalitsyn <[email protected]>
>>>>> Signed-off-by: Christian Brauner <[email protected]>
>>>>> Signed-off-by: Alexander Mikhalitsyn <[email protected]>
>>>>> ---
>>>>> v7:
>>>>> - reworked to use two new fields for owner UID/GID (https://github.com/ceph/ceph/pull/52575)
>>>>> v8:
>>>>> - properly handled case when old MDS used with new kernel client
>>>>> ---
>>>>> fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++++++---
>>>>> fs/ceph/mds_client.h | 5 +++-
>>>>> include/linux/ceph/ceph_fs.h | 5 +++-
>>>>> 3 files changed, 52 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>> index 8829f55103da..41e4bf3811c4 100644
>>>>> --- a/fs/ceph/mds_client.c
>>>>> +++ b/fs/ceph/mds_client.c
>>>>> @@ -2902,6 +2902,17 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
>>>>> }
>>>>> }
>>>>>
>>>>> +static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
>>>>> +{
>>>>> + if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
>>>>> + return 1;
>>>>> +
>>>>> + if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
>>>>> + return 2;
>>>>> +
>>>>> + return CEPH_MDS_REQUEST_HEAD_VERSION;
>>>>> +}
>>>>> +
>>>>> static struct ceph_mds_request_head_legacy *
>>>>> find_legacy_request_head(void *p, u64 features)
>>>>> {
>>>>> @@ -2923,6 +2934,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>>>> {
>>>>> int mds = session->s_mds;
>>>>> struct ceph_mds_client *mdsc = session->s_mdsc;
>>>>> + struct ceph_client *cl = mdsc->fsc->client;
>>>>> struct ceph_msg *msg;
>>>>> struct ceph_mds_request_head_legacy *lhead;
>>>>> const char *path1 = NULL;
>>>>> @@ -2936,7 +2948,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>>>> void *p, *end;
>>>>> int ret;
>>>>> bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
>>>>> - bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
>>>>> + u16 request_head_version = mds_supported_head_version(session);
>>>>>
>>>>> ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
>>>>> req->r_parent, req->r_path1, req->r_ino1.ino,
>>>>> @@ -2977,8 +2989,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>>>> */
>>>>> if (legacy)
>>>>> len = sizeof(struct ceph_mds_request_head_legacy);
>>>>> - else if (old_version)
>>>>> + else if (request_head_version == 1)
>>>>> len = sizeof(struct ceph_mds_request_head_old);
>>>>> + else if (request_head_version == 2)
>>>>> + len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
>>>>> else
>>>>> len = sizeof(struct ceph_mds_request_head);
>>>>>
>>>>> @@ -3028,6 +3042,16 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>>>> lhead = find_legacy_request_head(msg->front.iov_base,
>>>>> session->s_con.peer_features);
>>>>>
>>>>> + if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
>>>>> + !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
>>>>> + pr_err_ratelimited_client(cl,
>>>>> + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
>>>>> + " is not supported by MDS. Fail request with -EIO.\n");
>>>>> +
>>>>> + ret = -EIO;
>>>>> + goto out_err;
>>>>> + }
>>>>> +
>>>>> /*
>>>>> * The ceph_mds_request_head_legacy didn't contain a version field, and
>>>>> * one was added when we moved the message version from 3->4.
>>>>> @@ -3035,17 +3059,34 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>>>> if (legacy) {
>>>>> msg->hdr.version = cpu_to_le16(3);
>>>>> p = msg->front.iov_base + sizeof(*lhead);
>>>>> - } else if (old_version) {
>>>>> + } else if (request_head_version == 1) {
>>>>> struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
>>>>>
>>>>> msg->hdr.version = cpu_to_le16(4);
>>>>> ohead->version = cpu_to_le16(1);
>>>>> p = msg->front.iov_base + sizeof(*ohead);
>>>>> + } else if (request_head_version == 2) {
>>>>> + struct ceph_mds_request_head *nhead = msg->front.iov_base;
>>>>> +
>>>>> + msg->hdr.version = cpu_to_le16(6);
>>>>> + nhead->version = cpu_to_le16(2);
>>>>> +
>>>>> + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
>>>>> } else {
>>>>> struct ceph_mds_request_head *nhead = msg->front.iov_base;
>>>>> + kuid_t owner_fsuid;
>>>>> + kgid_t owner_fsgid;
>>>>>
>>>>> msg->hdr.version = cpu_to_le16(6);
>>>>> nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
>>>>> + nhead->struct_len = sizeof(struct ceph_mds_request_head);
>>>>> +
>>>>> + owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
>>>>> + VFSUIDT_INIT(req->r_cred->fsuid));
>>>>> + owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
>>>>> + VFSGIDT_INIT(req->r_cred->fsgid));
>>>>> + nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
>>>>> + nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
>>>>> p = msg->front.iov_base + sizeof(*nhead);
>>>>> }
>>>>>
>>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>>> index e3bbf3ba8ee8..8f683e8203bd 100644
>>>>> --- a/fs/ceph/mds_client.h
>>>>> +++ b/fs/ceph/mds_client.h
>>>>> @@ -33,8 +33,10 @@ enum ceph_feature_type {
>>>>> CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
>>>>> CEPHFS_FEATURE_OP_GETVXATTR,
>>>>> CEPHFS_FEATURE_32BITS_RETRY_FWD,
>>>>> + CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
>>>>> + CEPHFS_FEATURE_HAS_OWNER_UIDGID,
>>>>>
>>>>> - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
>>>>> + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
>>>>> };
>>>>>
>>>>> #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
>>>>> @@ -49,6 +51,7 @@ enum ceph_feature_type {
>>>>> CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
>>>>> CEPHFS_FEATURE_OP_GETVXATTR, \
>>>>> CEPHFS_FEATURE_32BITS_RETRY_FWD, \
>>>>> + CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
>>>>> }
>>>>>
>>>>> /*
>>>>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>>>>> index 5f2301ee88bc..b91699b08f26 100644
>>>>> --- a/include/linux/ceph/ceph_fs.h
>>>>> +++ b/include/linux/ceph/ceph_fs.h
>>>>> @@ -499,7 +499,7 @@ struct ceph_mds_request_head_legacy {
>>>>> union ceph_mds_request_args args;
>>>>> } __attribute__ ((packed));
>>>>>
>>>>> -#define CEPH_MDS_REQUEST_HEAD_VERSION 2
>>>>> +#define CEPH_MDS_REQUEST_HEAD_VERSION 3
>>>>>
>>>>> struct ceph_mds_request_head_old {
>>>>> __le16 version; /* struct version */
>>>>> @@ -530,6 +530,9 @@ struct ceph_mds_request_head {
>>>>>
>>>>> __le32 ext_num_retry; /* new count retry attempts */
>>>>> __le32 ext_num_fwd; /* new count fwd attempts */
>>>>> +
>>>>> + __le32 struct_len; /* to store size of struct ceph_mds_request_head */
>>>>> + __le32 owner_uid, owner_gid; /* used for OPs which create inodes */
>>>> Let's also initialize them to -1 for all the other requests as we do in
>>>> your PR.
>>> They are always initialized already. As you can see from the code we
>>> don't have any extra conditions
>>> on filling these fields. We always fill them with an appropriate
>>> UID/GID. If mount is not idmapped then it's just == caller_uid/gid,
>>> if mount idmapped then it's idmapped caller_uid/gid.
>> Then in kclient all the request will always initialized the
>> 'owner_{uid/gid}' with 'caller_{uid/gid}'. While in userspace libcephfs
>> it will only set them for 'create/mknod/mkdir/symlink` instead.
>>
>> I'd prefer to make them consistent, which is what I am still focusing
>> on, to make them easier to read and comparing when troubles hooting.
> Dear Xiubo,
>
> Sure, I will do it.
>
> Couldn't you please review all the rest of the patches before I fix
> this particular thing?
> It will allow me to fix and send -v10 with all required fixes
> incorporated in it.

I have gone through them all and they LGTM.

Thanks

- Xiubo


> Kind regards,
> Alex
>
>> Thanks
>>
>> - Xiubo
>>
>>> Kind regards,
>>> Alex
>>>
>>>> Thanks
>>>>
>>>> - Xiubo
>>>>
>>>>
>>>>
>>>>> } __attribute__ ((packed));
>>>>>
>>>>> /* cap/lease release record */


2023-08-07 13:10:26

by Aleksandr Mikhalitsyn

[permalink] [raw]
Subject: Re: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()

On Mon, Aug 7, 2023 at 1:21 PM Xiubo Li <[email protected]> wrote:
>
>
> On 8/7/23 18:34, Aleksandr Mikhalitsyn wrote:
> > On Mon, Aug 7, 2023 at 12:28 PM Xiubo Li <[email protected]> wrote:
> >>
> >> On 8/7/23 14:51, Aleksandr Mikhalitsyn wrote:
> >>> On Mon, Aug 7, 2023 at 3:25 AM Xiubo Li <[email protected]> wrote:
> >>>> On 8/4/23 16:48, Alexander Mikhalitsyn wrote:
> >>>>> From: Christian Brauner <[email protected]>
> >>>>>
> >>>>> Inode operations that create a new filesystem object such as ->mknod,
> >>>>> ->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
> >>>>> Instead the caller's fs{g,u}id is used for the {g,u}id of the new
> >>>>> filesystem object.
> >>>>>
> >>>>> In order to ensure that the correct {g,u}id is used map the caller's
> >>>>> fs{g,u}id for creation requests. This doesn't require complex changes.
> >>>>> It suffices to pass in the relevant idmapping recorded in the request
> >>>>> message. If this request message was triggered from an inode operation
> >>>>> that creates filesystem objects it will have passed down the relevant
> >>>>> idmaping. If this is a request message that was triggered from an inode
> >>>>> operation that doens't need to take idmappings into account the initial
> >>>>> idmapping is passed down which is an identity mapping.
> >>>>>
> >>>>> This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
> >>>>> which adds two new fields (owner_{u,g}id) to the request head structure.
> >>>>> So, we need to ensure that MDS supports it otherwise we need to fail
> >>>>> any IO that comes through an idmapped mount because we can't process it
> >>>>> in a proper way. MDS server without such an extension will use caller_{u,g}id
> >>>>> fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
> >>>>> values are unmapped. At the same time we can't map these fields with an
> >>>>> idmapping as it can break UID/GID-based permission checks logic on the
> >>>>> MDS side. This problem was described with a lot of details at [1], [2].
> >>>>>
> >>>>> [1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
> >>>>> [2] https://lore.kernel.org/all/[email protected]/
> >>>>>
> >>>>> Link: https://github.com/ceph/ceph/pull/52575
> >>>>> Link: https://tracker.ceph.com/issues/62217
> >>>>> Cc: Xiubo Li <[email protected]>
> >>>>> Cc: Jeff Layton <[email protected]>
> >>>>> Cc: Ilya Dryomov <[email protected]>
> >>>>> Cc: [email protected]
> >>>>> Co-Developed-by: Alexander Mikhalitsyn <[email protected]>
> >>>>> Signed-off-by: Christian Brauner <[email protected]>
> >>>>> Signed-off-by: Alexander Mikhalitsyn <[email protected]>
> >>>>> ---
> >>>>> v7:
> >>>>> - reworked to use two new fields for owner UID/GID (https://github.com/ceph/ceph/pull/52575)
> >>>>> v8:
> >>>>> - properly handled case when old MDS used with new kernel client
> >>>>> ---
> >>>>> fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++++++---
> >>>>> fs/ceph/mds_client.h | 5 +++-
> >>>>> include/linux/ceph/ceph_fs.h | 5 +++-
> >>>>> 3 files changed, 52 insertions(+), 5 deletions(-)
> >>>>>
> >>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >>>>> index 8829f55103da..41e4bf3811c4 100644
> >>>>> --- a/fs/ceph/mds_client.c
> >>>>> +++ b/fs/ceph/mds_client.c
> >>>>> @@ -2902,6 +2902,17 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
> >>>>> }
> >>>>> }
> >>>>>
> >>>>> +static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
> >>>>> +{
> >>>>> + if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
> >>>>> + return 1;
> >>>>> +
> >>>>> + if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
> >>>>> + return 2;
> >>>>> +
> >>>>> + return CEPH_MDS_REQUEST_HEAD_VERSION;
> >>>>> +}
> >>>>> +
> >>>>> static struct ceph_mds_request_head_legacy *
> >>>>> find_legacy_request_head(void *p, u64 features)
> >>>>> {
> >>>>> @@ -2923,6 +2934,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>>>> {
> >>>>> int mds = session->s_mds;
> >>>>> struct ceph_mds_client *mdsc = session->s_mdsc;
> >>>>> + struct ceph_client *cl = mdsc->fsc->client;
> >>>>> struct ceph_msg *msg;
> >>>>> struct ceph_mds_request_head_legacy *lhead;
> >>>>> const char *path1 = NULL;
> >>>>> @@ -2936,7 +2948,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>>>> void *p, *end;
> >>>>> int ret;
> >>>>> bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
> >>>>> - bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
> >>>>> + u16 request_head_version = mds_supported_head_version(session);
> >>>>>
> >>>>> ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> >>>>> req->r_parent, req->r_path1, req->r_ino1.ino,
> >>>>> @@ -2977,8 +2989,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>>>> */
> >>>>> if (legacy)
> >>>>> len = sizeof(struct ceph_mds_request_head_legacy);
> >>>>> - else if (old_version)
> >>>>> + else if (request_head_version == 1)
> >>>>> len = sizeof(struct ceph_mds_request_head_old);
> >>>>> + else if (request_head_version == 2)
> >>>>> + len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> >>>>> else
> >>>>> len = sizeof(struct ceph_mds_request_head);
> >>>>>
> >>>>> @@ -3028,6 +3042,16 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>>>> lhead = find_legacy_request_head(msg->front.iov_base,
> >>>>> session->s_con.peer_features);
> >>>>>
> >>>>> + if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
> >>>>> + !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
> >>>>> + pr_err_ratelimited_client(cl,
> >>>>> + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
> >>>>> + " is not supported by MDS. Fail request with -EIO.\n");
> >>>>> +
> >>>>> + ret = -EIO;
> >>>>> + goto out_err;
> >>>>> + }
> >>>>> +
> >>>>> /*
> >>>>> * The ceph_mds_request_head_legacy didn't contain a version field, and
> >>>>> * one was added when we moved the message version from 3->4.
> >>>>> @@ -3035,17 +3059,34 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>>>> if (legacy) {
> >>>>> msg->hdr.version = cpu_to_le16(3);
> >>>>> p = msg->front.iov_base + sizeof(*lhead);
> >>>>> - } else if (old_version) {
> >>>>> + } else if (request_head_version == 1) {
> >>>>> struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
> >>>>>
> >>>>> msg->hdr.version = cpu_to_le16(4);
> >>>>> ohead->version = cpu_to_le16(1);
> >>>>> p = msg->front.iov_base + sizeof(*ohead);
> >>>>> + } else if (request_head_version == 2) {
> >>>>> + struct ceph_mds_request_head *nhead = msg->front.iov_base;
> >>>>> +
> >>>>> + msg->hdr.version = cpu_to_le16(6);
> >>>>> + nhead->version = cpu_to_le16(2);
> >>>>> +
> >>>>> + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> >>>>> } else {
> >>>>> struct ceph_mds_request_head *nhead = msg->front.iov_base;
> >>>>> + kuid_t owner_fsuid;
> >>>>> + kgid_t owner_fsgid;
> >>>>>
> >>>>> msg->hdr.version = cpu_to_le16(6);
> >>>>> nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> >>>>> + nhead->struct_len = sizeof(struct ceph_mds_request_head);
> >>>>> +
> >>>>> + owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
> >>>>> + VFSUIDT_INIT(req->r_cred->fsuid));
> >>>>> + owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
> >>>>> + VFSGIDT_INIT(req->r_cred->fsgid));
> >>>>> + nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
> >>>>> + nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
> >>>>> p = msg->front.iov_base + sizeof(*nhead);
> >>>>> }
> >>>>>
> >>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> >>>>> index e3bbf3ba8ee8..8f683e8203bd 100644
> >>>>> --- a/fs/ceph/mds_client.h
> >>>>> +++ b/fs/ceph/mds_client.h
> >>>>> @@ -33,8 +33,10 @@ enum ceph_feature_type {
> >>>>> CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
> >>>>> CEPHFS_FEATURE_OP_GETVXATTR,
> >>>>> CEPHFS_FEATURE_32BITS_RETRY_FWD,
> >>>>> + CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
> >>>>> + CEPHFS_FEATURE_HAS_OWNER_UIDGID,
> >>>>>
> >>>>> - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
> >>>>> + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
> >>>>> };
> >>>>>
> >>>>> #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
> >>>>> @@ -49,6 +51,7 @@ enum ceph_feature_type {
> >>>>> CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
> >>>>> CEPHFS_FEATURE_OP_GETVXATTR, \
> >>>>> CEPHFS_FEATURE_32BITS_RETRY_FWD, \
> >>>>> + CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
> >>>>> }
> >>>>>
> >>>>> /*
> >>>>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> >>>>> index 5f2301ee88bc..b91699b08f26 100644
> >>>>> --- a/include/linux/ceph/ceph_fs.h
> >>>>> +++ b/include/linux/ceph/ceph_fs.h
> >>>>> @@ -499,7 +499,7 @@ struct ceph_mds_request_head_legacy {
> >>>>> union ceph_mds_request_args args;
> >>>>> } __attribute__ ((packed));
> >>>>>
> >>>>> -#define CEPH_MDS_REQUEST_HEAD_VERSION 2
> >>>>> +#define CEPH_MDS_REQUEST_HEAD_VERSION 3
> >>>>>
> >>>>> struct ceph_mds_request_head_old {
> >>>>> __le16 version; /* struct version */
> >>>>> @@ -530,6 +530,9 @@ struct ceph_mds_request_head {
> >>>>>
> >>>>> __le32 ext_num_retry; /* new count retry attempts */
> >>>>> __le32 ext_num_fwd; /* new count fwd attempts */
> >>>>> +
> >>>>> + __le32 struct_len; /* to store size of struct ceph_mds_request_head */
> >>>>> + __le32 owner_uid, owner_gid; /* used for OPs which create inodes */
> >>>> Let's also initialize them to -1 for all the other requests as we do in
> >>>> your PR.
> >>> They are always initialized already. As you can see from the code we
> >>> don't have any extra conditions
> >>> on filling these fields. We always fill them with an appropriate
> >>> UID/GID. If mount is not idmapped then it's just == caller_uid/gid,
> >>> if mount idmapped then it's idmapped caller_uid/gid.
> >> Then in kclient all the request will always initialized the
> >> 'owner_{uid/gid}' with 'caller_{uid/gid}'. While in userspace libcephfs
> >> it will only set them for 'create/mknod/mkdir/symlink` instead.
> >>
> >> I'd prefer to make them consistent, which is what I am still focusing
> >> on, to make them easier to read and comparing when troubles hooting.
> > Dear Xiubo,
> >
> > Sure, I will do it.
> >
> > Couldn't you please review all the rest of the patches before I fix
> > this particular thing?
> > It will allow me to fix and send -v10 with all required fixes
> > incorporated in it.
>
> I have gone through them all and they LGTM.

Thanks!

Kind regards,
Alex

>
> Thanks
>
> - Xiubo
>
>
> > Kind regards,
> > Alex
> >
> >> Thanks
> >>
> >> - Xiubo
> >>
> >>> Kind regards,
> >>> Alex
> >>>
> >>>> Thanks
> >>>>
> >>>> - Xiubo
> >>>>
> >>>>
> >>>>
> >>>>> } __attribute__ ((packed));
> >>>>>
> >>>>> /* cap/lease release record */
>

2023-08-07 13:53:56

by Aleksandr Mikhalitsyn

[permalink] [raw]
Subject: Re: [PATCH v9 03/12] ceph: handle idmapped mounts in create_request_message()

On Mon, Aug 7, 2023 at 12:28 PM Xiubo Li <[email protected]> wrote:
>
>
> On 8/7/23 14:51, Aleksandr Mikhalitsyn wrote:
> > On Mon, Aug 7, 2023 at 3:25 AM Xiubo Li <[email protected]> wrote:
> >>
> >> On 8/4/23 16:48, Alexander Mikhalitsyn wrote:
> >>> From: Christian Brauner <[email protected]>
> >>>
> >>> Inode operations that create a new filesystem object such as ->mknod,
> >>> ->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
> >>> Instead the caller's fs{g,u}id is used for the {g,u}id of the new
> >>> filesystem object.
> >>>
> >>> In order to ensure that the correct {g,u}id is used map the caller's
> >>> fs{g,u}id for creation requests. This doesn't require complex changes.
> >>> It suffices to pass in the relevant idmapping recorded in the request
> >>> message. If this request message was triggered from an inode operation
> >>> that creates filesystem objects it will have passed down the relevant
> >>> idmaping. If this is a request message that was triggered from an inode
> >>> operation that doens't need to take idmappings into account the initial
> >>> idmapping is passed down which is an identity mapping.
> >>>
> >>> This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
> >>> which adds two new fields (owner_{u,g}id) to the request head structure.
> >>> So, we need to ensure that MDS supports it otherwise we need to fail
> >>> any IO that comes through an idmapped mount because we can't process it
> >>> in a proper way. MDS server without such an extension will use caller_{u,g}id
> >>> fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
> >>> values are unmapped. At the same time we can't map these fields with an
> >>> idmapping as it can break UID/GID-based permission checks logic on the
> >>> MDS side. This problem was described with a lot of details at [1], [2].
> >>>
> >>> [1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
> >>> [2] https://lore.kernel.org/all/[email protected]/
> >>>
> >>> Link: https://github.com/ceph/ceph/pull/52575
> >>> Link: https://tracker.ceph.com/issues/62217
> >>> Cc: Xiubo Li <[email protected]>
> >>> Cc: Jeff Layton <[email protected]>
> >>> Cc: Ilya Dryomov <[email protected]>
> >>> Cc: [email protected]
> >>> Co-Developed-by: Alexander Mikhalitsyn <[email protected]>
> >>> Signed-off-by: Christian Brauner <[email protected]>
> >>> Signed-off-by: Alexander Mikhalitsyn <[email protected]>
> >>> ---
> >>> v7:
> >>> - reworked to use two new fields for owner UID/GID (https://github.com/ceph/ceph/pull/52575)
> >>> v8:
> >>> - properly handled case when old MDS used with new kernel client
> >>> ---
> >>> fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++++++---
> >>> fs/ceph/mds_client.h | 5 +++-
> >>> include/linux/ceph/ceph_fs.h | 5 +++-
> >>> 3 files changed, 52 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >>> index 8829f55103da..41e4bf3811c4 100644
> >>> --- a/fs/ceph/mds_client.c
> >>> +++ b/fs/ceph/mds_client.c
> >>> @@ -2902,6 +2902,17 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
> >>> }
> >>> }
> >>>
> >>> +static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
> >>> +{
> >>> + if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
> >>> + return 1;
> >>> +
> >>> + if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
> >>> + return 2;
> >>> +
> >>> + return CEPH_MDS_REQUEST_HEAD_VERSION;
> >>> +}
> >>> +
> >>> static struct ceph_mds_request_head_legacy *
> >>> find_legacy_request_head(void *p, u64 features)
> >>> {
> >>> @@ -2923,6 +2934,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>> {
> >>> int mds = session->s_mds;
> >>> struct ceph_mds_client *mdsc = session->s_mdsc;
> >>> + struct ceph_client *cl = mdsc->fsc->client;
> >>> struct ceph_msg *msg;
> >>> struct ceph_mds_request_head_legacy *lhead;
> >>> const char *path1 = NULL;
> >>> @@ -2936,7 +2948,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>> void *p, *end;
> >>> int ret;
> >>> bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
> >>> - bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
> >>> + u16 request_head_version = mds_supported_head_version(session);
> >>>
> >>> ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> >>> req->r_parent, req->r_path1, req->r_ino1.ino,
> >>> @@ -2977,8 +2989,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>> */
> >>> if (legacy)
> >>> len = sizeof(struct ceph_mds_request_head_legacy);
> >>> - else if (old_version)
> >>> + else if (request_head_version == 1)
> >>> len = sizeof(struct ceph_mds_request_head_old);
> >>> + else if (request_head_version == 2)
> >>> + len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> >>> else
> >>> len = sizeof(struct ceph_mds_request_head);
> >>>
> >>> @@ -3028,6 +3042,16 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>> lhead = find_legacy_request_head(msg->front.iov_base,
> >>> session->s_con.peer_features);
> >>>
> >>> + if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
> >>> + !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
> >>> + pr_err_ratelimited_client(cl,
> >>> + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
> >>> + " is not supported by MDS. Fail request with -EIO.\n");
> >>> +
> >>> + ret = -EIO;
> >>> + goto out_err;
> >>> + }
> >>> +
> >>> /*
> >>> * The ceph_mds_request_head_legacy didn't contain a version field, and
> >>> * one was added when we moved the message version from 3->4.
> >>> @@ -3035,17 +3059,34 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>> if (legacy) {
> >>> msg->hdr.version = cpu_to_le16(3);
> >>> p = msg->front.iov_base + sizeof(*lhead);
> >>> - } else if (old_version) {
> >>> + } else if (request_head_version == 1) {
> >>> struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
> >>>
> >>> msg->hdr.version = cpu_to_le16(4);
> >>> ohead->version = cpu_to_le16(1);
> >>> p = msg->front.iov_base + sizeof(*ohead);
> >>> + } else if (request_head_version == 2) {
> >>> + struct ceph_mds_request_head *nhead = msg->front.iov_base;
> >>> +
> >>> + msg->hdr.version = cpu_to_le16(6);
> >>> + nhead->version = cpu_to_le16(2);
> >>> +
> >>> + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
> >>> } else {
> >>> struct ceph_mds_request_head *nhead = msg->front.iov_base;
> >>> + kuid_t owner_fsuid;
> >>> + kgid_t owner_fsgid;
> >>>
> >>> msg->hdr.version = cpu_to_le16(6);
> >>> nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> >>> + nhead->struct_len = sizeof(struct ceph_mds_request_head);
> >>> +
> >>> + owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
> >>> + VFSUIDT_INIT(req->r_cred->fsuid));
> >>> + owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
> >>> + VFSGIDT_INIT(req->r_cred->fsgid));
> >>> + nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
> >>> + nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
> >>> p = msg->front.iov_base + sizeof(*nhead);
> >>> }
> >>>
> >>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> >>> index e3bbf3ba8ee8..8f683e8203bd 100644
> >>> --- a/fs/ceph/mds_client.h
> >>> +++ b/fs/ceph/mds_client.h
> >>> @@ -33,8 +33,10 @@ enum ceph_feature_type {
> >>> CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
> >>> CEPHFS_FEATURE_OP_GETVXATTR,
> >>> CEPHFS_FEATURE_32BITS_RETRY_FWD,
> >>> + CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
> >>> + CEPHFS_FEATURE_HAS_OWNER_UIDGID,
> >>>
> >>> - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
> >>> + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
> >>> };
> >>>
> >>> #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
> >>> @@ -49,6 +51,7 @@ enum ceph_feature_type {
> >>> CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
> >>> CEPHFS_FEATURE_OP_GETVXATTR, \
> >>> CEPHFS_FEATURE_32BITS_RETRY_FWD, \
> >>> + CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
> >>> }
> >>>
> >>> /*
> >>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> >>> index 5f2301ee88bc..b91699b08f26 100644
> >>> --- a/include/linux/ceph/ceph_fs.h
> >>> +++ b/include/linux/ceph/ceph_fs.h
> >>> @@ -499,7 +499,7 @@ struct ceph_mds_request_head_legacy {
> >>> union ceph_mds_request_args args;
> >>> } __attribute__ ((packed));
> >>>
> >>> -#define CEPH_MDS_REQUEST_HEAD_VERSION 2
> >>> +#define CEPH_MDS_REQUEST_HEAD_VERSION 3
> >>>
> >>> struct ceph_mds_request_head_old {
> >>> __le16 version; /* struct version */
> >>> @@ -530,6 +530,9 @@ struct ceph_mds_request_head {
> >>>
> >>> __le32 ext_num_retry; /* new count retry attempts */
> >>> __le32 ext_num_fwd; /* new count fwd attempts */
> >>> +
> >>> + __le32 struct_len; /* to store size of struct ceph_mds_request_head */
> >>> + __le32 owner_uid, owner_gid; /* used for OPs which create inodes */
> >> Let's also initialize them to -1 for all the other requests as we do in
> >> your PR.
> > They are always initialized already. As you can see from the code we
> > don't have any extra conditions
> > on filling these fields. We always fill them with an appropriate
> > UID/GID. If mount is not idmapped then it's just == caller_uid/gid,
> > if mount idmapped then it's idmapped caller_uid/gid.
>
> Then in kclient all the request will always initialized the
> 'owner_{uid/gid}' with 'caller_{uid/gid}'. While in userspace libcephfs
> it will only set them for 'create/mknod/mkdir/symlink` instead.
>
> I'd prefer to make them consistent, which is what I am still focusing
> on, to make them easier to read and comparing when troubles hooting.

Have fixed:
https://github.com/mihalicyn/linux/commit/5a5b590ca5aa9ec81d68ff60d77ea54fc86bf33a

Also have added appropriate checks in mkdir/atomic_open:
https://github.com/mihalicyn/linux/commit/bc1fa68f7143a58af8c181bbfab64edf0397dca5
https://github.com/mihalicyn/linux/commit/30e21387063710a10cdca15a5d840fcf8e1e6ccd

Will send v10 soon.

Kind regards,
Alex

>
> Thanks
>
> - Xiubo
>
> > Kind regards,
> > Alex
> >
> >> Thanks
> >>
> >> - Xiubo
> >>
> >>
> >>
> >>> } __attribute__ ((packed));
> >>>
> >>> /* cap/lease release record */
>