2022-06-29 23:10:38

by Khalid Aziz

[permalink] [raw]
Subject: [PATCH v2 6/9] mm/mshare: Add mmap operation

mmap is used to establish the address range for an mshare region and to
map the region into a process's address space. Add a basic mmap operation
that supports setting the address range. Also fix the code to not allocate
a new mm_struct for files in msharefs that exist for information and not
for defining a new mshare region.

Signed-off-by: Khalid Aziz <[email protected]>
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
mm/mshare.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/mm/mshare.c b/mm/mshare.c
index d238b68b0576..088a6cab1e93 100644
--- a/mm/mshare.c
+++ b/mm/mshare.c
@@ -9,7 +9,8 @@
*
*
* Copyright (C) 2022 Oracle Corp. All rights reserved.
- * Author: Khalid Aziz <[email protected]>
+ * Authors: Khalid Aziz <[email protected]>
+ * Matthew Wilcox <[email protected]>
*
*/

@@ -60,9 +61,36 @@ msharefs_read(struct kiocb *iocb, struct iov_iter *iov)
return ret;
}

+static int
+msharefs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct mshare_data *info = file->private_data;
+ struct mm_struct *mm = info->mm;
+
+ /*
+ * If this mshare region has been set up once already, bail out
+ */
+ if (mm->mmap_base != 0)
+ return -EINVAL;
+
+ if ((vma->vm_start | vma->vm_end) & (PGDIR_SIZE - 1))
+ return -EINVAL;
+
+ mm->mmap_base = vma->vm_start;
+ mm->task_size = vma->vm_end - vma->vm_start;
+ if (!mm->task_size)
+ mm->task_size--;
+ info->minfo->start = mm->mmap_base;
+ info->minfo->size = mm->task_size;
+ vma->vm_flags |= VM_SHARED_PT;
+ vma->vm_private_data = info;
+ return 0;
+}
+
static const struct file_operations msharefs_file_operations = {
.open = msharefs_open,
.read_iter = msharefs_read,
+ .mmap = msharefs_mmap,
.llseek = no_llseek,
};

@@ -119,7 +147,12 @@ msharefs_fill_mm(struct inode *inode)
goto err_free;
}
info->mm = mm;
- info->minfo = NULL;
+ info->minfo = kzalloc(sizeof(struct mshare_info), GFP_KERNEL);
+ if (info->minfo == NULL) {
+ retval = -ENOMEM;
+ goto err_free;
+ }
+
refcount_set(&info->refcnt, 1);
inode->i_private = info;

@@ -128,13 +161,14 @@ msharefs_fill_mm(struct inode *inode)
err_free:
if (mm)
mmput(mm);
+ kfree(info->minfo);
kfree(info);
return retval;
}

static struct inode
*msharefs_get_inode(struct super_block *sb, const struct inode *dir,
- umode_t mode)
+ umode_t mode, bool newmm)
{
struct inode *inode = new_inode(sb);
if (inode) {
@@ -147,7 +181,7 @@ static struct inode
case S_IFREG:
inode->i_op = &msharefs_file_inode_ops;
inode->i_fop = &msharefs_file_operations;
- if (msharefs_fill_mm(inode) != 0) {
+ if (newmm && msharefs_fill_mm(inode) != 0) {
discard_new_inode(inode);
inode = ERR_PTR(-ENOMEM);
}
@@ -177,7 +211,7 @@ msharefs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
struct inode *inode;
int err = 0;

- inode = msharefs_get_inode(dir->i_sb, dir, mode);
+ inode = msharefs_get_inode(dir->i_sb, dir, mode, true);
if (IS_ERR(inode))
return PTR_ERR(inode);

@@ -267,7 +301,7 @@ prepopulate_files(struct super_block *s, struct inode *dir,
if (!dentry)
return -ENOMEM;

- inode = msharefs_get_inode(s, dir, S_IFREG | files->mode);
+ inode = msharefs_get_inode(s, dir, S_IFREG | files->mode, false);
if (!inode) {
dput(dentry);
return -ENOMEM;
@@ -301,7 +335,7 @@ msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_d_op = &msharefs_d_ops;
sb->s_time_gran = 1;

- inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777);
+ inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777, false);
if (!inode) {
err = -ENOMEM;
goto out;
--
2.32.0


2022-06-30 22:09:42

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH v2 6/9] mm/mshare: Add mmap operation

On Wed, Jun 29, 2022 at 04:53:57PM -0600, Khalid Aziz wrote:
> mmap is used to establish address range for mshare region and map the
> region into process's address space. Add basic mmap operation that
> supports setting address range. Also fix code to not allocate new
> mm_struct for files in msharefs that exist for information and not
> for defining a new mshare region.
>
> Signed-off-by: Khalid Aziz <[email protected]>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
> ---
> mm/mshare.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 41 insertions(+), 7 deletions(-)
>
> diff --git a/mm/mshare.c b/mm/mshare.c
> index d238b68b0576..088a6cab1e93 100644
> --- a/mm/mshare.c
> +++ b/mm/mshare.c
> @@ -9,7 +9,8 @@
> *
> *
> * Copyright (C) 2022 Oracle Corp. All rights reserved.
> - * Author: Khalid Aziz <[email protected]>
> + * Authors: Khalid Aziz <[email protected]>
> + * Matthew Wilcox <[email protected]>
> *
> */
>
> @@ -60,9 +61,36 @@ msharefs_read(struct kiocb *iocb, struct iov_iter *iov)
> return ret;
> }
>
> +static int
> +msharefs_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> + struct mshare_data *info = file->private_data;
> + struct mm_struct *mm = info->mm;
> +
> + /*
> + * If this mshare region has been set up once already, bail out
> + */
> + if (mm->mmap_base != 0)
> + return -EINVAL;
> +
> + if ((vma->vm_start | vma->vm_end) & (PGDIR_SIZE - 1))
> + return -EINVAL;
> +
> + mm->mmap_base = vma->vm_start;
> + mm->task_size = vma->vm_end - vma->vm_start;
> + if (!mm->task_size)
> + mm->task_size--;
> + info->minfo->start = mm->mmap_base;
> + info->minfo->size = mm->task_size;

So, uh, if the second mmap() caller decides to ignore the mshare_info,
should they get an -EINVAL here since the memory mappings won't be at
the same process virtual address?

> + vma->vm_flags |= VM_SHARED_PT;
> + vma->vm_private_data = info;
> + return 0;
> +}
> +
> static const struct file_operations msharefs_file_operations = {
> .open = msharefs_open,
> .read_iter = msharefs_read,
> + .mmap = msharefs_mmap,
> .llseek = no_llseek,
> };
>
> @@ -119,7 +147,12 @@ msharefs_fill_mm(struct inode *inode)
> goto err_free;
> }
> info->mm = mm;
> - info->minfo = NULL;
> + info->minfo = kzalloc(sizeof(struct mshare_info), GFP_KERNEL);
> + if (info->minfo == NULL) {
> + retval = -ENOMEM;
> + goto err_free;
> + }
> +
> refcount_set(&info->refcnt, 1);
> inode->i_private = info;
>
> @@ -128,13 +161,14 @@ msharefs_fill_mm(struct inode *inode)
> err_free:
> if (mm)
> mmput(mm);
> + kfree(info->minfo);
> kfree(info);
> return retval;
> }
>
> static struct inode
> *msharefs_get_inode(struct super_block *sb, const struct inode *dir,
> - umode_t mode)
> + umode_t mode, bool newmm)
> {
> struct inode *inode = new_inode(sb);
> if (inode) {
> @@ -147,7 +181,7 @@ static struct inode
> case S_IFREG:
> inode->i_op = &msharefs_file_inode_ops;
> inode->i_fop = &msharefs_file_operations;
> - if (msharefs_fill_mm(inode) != 0) {
> + if (newmm && msharefs_fill_mm(inode) != 0) {
> discard_new_inode(inode);
> inode = ERR_PTR(-ENOMEM);
> }
> @@ -177,7 +211,7 @@ msharefs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
> struct inode *inode;
> int err = 0;
>
> - inode = msharefs_get_inode(dir->i_sb, dir, mode);
> + inode = msharefs_get_inode(dir->i_sb, dir, mode, true);
> if (IS_ERR(inode))
> return PTR_ERR(inode);
>
> @@ -267,7 +301,7 @@ prepopulate_files(struct super_block *s, struct inode *dir,
> if (!dentry)
> return -ENOMEM;
>
> - inode = msharefs_get_inode(s, dir, S_IFREG | files->mode);
> + inode = msharefs_get_inode(s, dir, S_IFREG | files->mode, false);

I was wondering why the information files were getting their own
mshare_data.

TBH I'm not really sure what the difference is between mshare_data and
mshare_info, since those names are not especially distinct.

> if (!inode) {
> dput(dentry);
> return -ENOMEM;
> @@ -301,7 +335,7 @@ msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
> sb->s_d_op = &msharefs_d_ops;
> sb->s_time_gran = 1;
>
> - inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777);
> + inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777, false);

Is it wise to default to world-writable? Surely whatever userspace
software wraps an msharefs can relax permissions as needed.

--D

> if (!inode) {
> err = -ENOMEM;
> goto out;
> --
> 2.32.0
>

2022-06-30 23:47:22

by Khalid Aziz

[permalink] [raw]
Subject: Re: [PATCH v2 6/9] mm/mshare: Add mmap operation

On 6/30/22 15:44, Darrick J. Wong wrote:
> On Wed, Jun 29, 2022 at 04:53:57PM -0600, Khalid Aziz wrote:
>> mmap is used to establish address range for mshare region and map the
>> region into process's address space. Add basic mmap operation that
>> supports setting address range. Also fix code to not allocate new
>> mm_struct for files in msharefs that exist for information and not
>> for defining a new mshare region.
>>
>> Signed-off-by: Khalid Aziz <[email protected]>
>> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
>> ---
>> mm/mshare.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
>> 1 file changed, 41 insertions(+), 7 deletions(-)
>>
>> diff --git a/mm/mshare.c b/mm/mshare.c
>> index d238b68b0576..088a6cab1e93 100644
>> --- a/mm/mshare.c
>> +++ b/mm/mshare.c
>> @@ -9,7 +9,8 @@
>> *
>> *
>> * Copyright (C) 2022 Oracle Corp. All rights reserved.
>> - * Author: Khalid Aziz <[email protected]>
>> + * Authors: Khalid Aziz <[email protected]>
>> + * Matthew Wilcox <[email protected]>
>> *
>> */
>>
>> @@ -60,9 +61,36 @@ msharefs_read(struct kiocb *iocb, struct iov_iter *iov)
>> return ret;
>> }
>>
>> +static int
>> +msharefs_mmap(struct file *file, struct vm_area_struct *vma)
>> +{
>> + struct mshare_data *info = file->private_data;
>> + struct mm_struct *mm = info->mm;
>> +
>> + /*
>> + * If this mshare region has been set up once already, bail out
>> + */
>> + if (mm->mmap_base != 0)
>> + return -EINVAL;
>> +
>> + if ((vma->vm_start | vma->vm_end) & (PGDIR_SIZE - 1))
>> + return -EINVAL;
>> +
>> + mm->mmap_base = vma->vm_start;
>> + mm->task_size = vma->vm_end - vma->vm_start;
>> + if (!mm->task_size)
>> + mm->task_size--;
>> + info->minfo->start = mm->mmap_base;
>> + info->minfo->size = mm->task_size;
>
> So, uh, if the second mmap() caller decides to ignore the mshare_info,
> should they get an -EINVAL here since the memory mappings won't be at
> the same process virtual address?

Yes, that is handled in patch 9. Until patch 9, a second mmap will result in -EINVAL irrespective of the address and size passed to mmap.

>
>> + vma->vm_flags |= VM_SHARED_PT;
>> + vma->vm_private_data = info;
>> + return 0;
>> +}
>> +
>> static const struct file_operations msharefs_file_operations = {
>> .open = msharefs_open,
>> .read_iter = msharefs_read,
>> + .mmap = msharefs_mmap,
>> .llseek = no_llseek,
>> };
>>
>> @@ -119,7 +147,12 @@ msharefs_fill_mm(struct inode *inode)
>> goto err_free;
>> }
>> info->mm = mm;
>> - info->minfo = NULL;
>> + info->minfo = kzalloc(sizeof(struct mshare_info), GFP_KERNEL);
>> + if (info->minfo == NULL) {
>> + retval = -ENOMEM;
>> + goto err_free;
>> + }
>> +
>> refcount_set(&info->refcnt, 1);
>> inode->i_private = info;
>>
>> @@ -128,13 +161,14 @@ msharefs_fill_mm(struct inode *inode)
>> err_free:
>> if (mm)
>> mmput(mm);
>> + kfree(info->minfo);
>> kfree(info);
>> return retval;
>> }
>>
>> static struct inode
>> *msharefs_get_inode(struct super_block *sb, const struct inode *dir,
>> - umode_t mode)
>> + umode_t mode, bool newmm)
>> {
>> struct inode *inode = new_inode(sb);
>> if (inode) {
>> @@ -147,7 +181,7 @@ static struct inode
>> case S_IFREG:
>> inode->i_op = &msharefs_file_inode_ops;
>> inode->i_fop = &msharefs_file_operations;
>> - if (msharefs_fill_mm(inode) != 0) {
>> + if (newmm && msharefs_fill_mm(inode) != 0) {
>> discard_new_inode(inode);
>> inode = ERR_PTR(-ENOMEM);
>> }
>> @@ -177,7 +211,7 @@ msharefs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
>> struct inode *inode;
>> int err = 0;
>>
>> - inode = msharefs_get_inode(dir->i_sb, dir, mode);
>> + inode = msharefs_get_inode(dir->i_sb, dir, mode, true);
>> if (IS_ERR(inode))
>> return PTR_ERR(inode);
>>
>> @@ -267,7 +301,7 @@ prepopulate_files(struct super_block *s, struct inode *dir,
>> if (!dentry)
>> return -ENOMEM;
>>
>> - inode = msharefs_get_inode(s, dir, S_IFREG | files->mode);
>> + inode = msharefs_get_inode(s, dir, S_IFREG | files->mode, false);
>
> I was wondering why the information files were getting their own
> mshare_data.
>
> TBH I'm not really sure what the difference is between mshare_data and
> mshare_info, since those names are not especially distinct.

mshare_data is a superset and is internal, while mshare_info is what is sent back to userspace when it reads a file
representing an mshare region.

>
>> if (!inode) {
>> dput(dentry);
>> return -ENOMEM;
>> @@ -301,7 +335,7 @@ msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
>> sb->s_d_op = &msharefs_d_ops;
>> sb->s_time_gran = 1;
>>
>> - inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777);
>> + inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777, false);
>
> Is it wise to default to world-writable? Surely whatever userspace
> software wraps an msharefs can relax permissions as needed.
>

Since this is for the root inode, the default is set so that any process can create an mshare region in msharefs, which I
think is the most flexible. An individual userspace app can create mshare regions with any permissions it deems fit using
open(). Does that make sense?

Thanks,
Khalid