This patch series is in two parts:-
1. Currently there are a number of places in the kernel where we assume
VM_SHARED implies that a mapping is writable. Let's be slightly less
strict and relax this restriction in the case that VM_MAYWRITE is not
set.
This should have no noticeable impact as the lack of VM_MAYWRITE implies
that the mapping can not be made writable via mprotect() or any other
means.
2. Align the behaviour of F_SEAL_WRITE and F_SEAL_FUTURE_WRITE on mmap().
The latter already clears the VM_MAYWRITE flag for a sealed read-only
mapping, we simply extend this to F_SEAL_WRITE too.
For this to have effect, we must also invoke call_mmap() before
mapping_map_writable().
As this is quite a fundamental change on the assumptions around VM_SHARED
and since this causes a visible change to userland (in permitting read-only
shared mappings on F_SEAL_WRITE mappings), I am putting forward as an RFC
to see if there is anything terribly wrong with it.
I suspect even if the patch series as a whole is unpalatable, there are
probably things we can salvage from it in any case.
Thanks to Andy Lutomirski who inspired the series!
Lorenzo Stoakes (3):
mm: drop the assumption that VM_SHARED always implies writable
mm: update seal_check_[future_]write() to include F_SEAL_WRITE as well
mm: perform the mapping_map_writable() check after call_mmap()
fs/hugetlbfs/inode.c | 2 +-
include/linux/fs.h | 4 ++--
include/linux/mm.h | 24 ++++++++++++++++++------
kernel/fork.c | 2 +-
mm/filemap.c | 2 +-
mm/madvise.c | 2 +-
mm/mmap.c | 22 +++++++++++-----------
mm/shmem.c | 2 +-
8 files changed, 36 insertions(+), 24 deletions(-)
--
2.40.0
There are places in the kernel where there is an implicit assumption that
VM_SHARED VMAs must either be writable or might become writable via
e.g. mprotect().
We can explicitly check for the writable, shared case while remaining
conservative - If VM_MAYWRITE is not set then, by definition, the memory
can never be written to.
Update these checks to also check for VM_MAYWRITE.
Suggested-by: Andy Lutomirski <[email protected]>
Signed-off-by: Lorenzo Stoakes <[email protected]>
---
include/linux/fs.h | 4 ++--
include/linux/mm.h | 11 +++++++++++
kernel/fork.c | 2 +-
mm/filemap.c | 2 +-
mm/madvise.c | 2 +-
mm/mmap.c | 12 ++++++------
6 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c85916e9f7db..373e1edd719c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -410,7 +410,7 @@ extern const struct address_space_operations empty_aops;
* It is also used to block modification of page cache contents through
* memory mappings.
* @gfp_mask: Memory allocation flags to use for allocating pages.
- * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
* @nr_thps: Number of THPs in the pagecache (non-shmem only).
* @i_mmap: Tree of private and shared mappings.
* @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
@@ -513,7 +513,7 @@ static inline int mapping_mapped(struct address_space *mapping)
/*
* Might pages of this file have been modified in userspace?
- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
+ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
* marks vma as VM_SHARED if it is shared, and the file was opened for
* writing i.e. vma may be mprotected writable even if now readonly.
*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 898ece0a3802..8e64041b1703 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -862,6 +862,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & VM_ACCESS_FLAGS;
}
+static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+{
+ return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+ (VM_SHARED | VM_MAYWRITE);
+}
+
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+ return is_shared_maywrite(vma->vm_flags);
+}
+
static inline
struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index 2066a57786a8..58f257d60fee 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -733,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
get_file(file);
i_mmap_lock_write(mapping);
- if (tmp->vm_flags & VM_SHARED)
+ if (vma_is_shared_maywrite(tmp))
mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
diff --git a/mm/filemap.c b/mm/filemap.c
index a34abfe8c654..4d896515032c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3607,7 +3607,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
*/
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ if (vma_is_shared_maywrite(vma))
return -EINVAL;
return generic_file_mmap(file, vma);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 340125d08c03..606c395c4ddd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -981,7 +981,7 @@ static long madvise_remove(struct vm_area_struct *vma,
return -EINVAL;
}
- if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
+ if (!vma_is_shared_maywrite(vma))
return -EACCES;
offset = (loff_t)(start - vma->vm_start)
diff --git a/mm/mmap.c b/mm/mmap.c
index 51cd747884e3..c96dcce90772 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -106,7 +106,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
- if (vma->vm_flags & VM_SHARED)
+ if (vma_is_shared_maywrite(vma))
mapping_unmap_writable(mapping);
flush_dcache_mmap_lock(mapping);
@@ -427,7 +427,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
static void __vma_link_file(struct vm_area_struct *vma,
struct address_space *mapping)
{
- if (vma->vm_flags & VM_SHARED)
+ if (vma_is_shared_maywrite(vma))
mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
@@ -2596,7 +2596,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma->vm_pgoff = pgoff;
if (file) {
- if (vm_flags & VM_SHARED) {
+ if (is_shared_maywrite(vm_flags)) {
error = mapping_map_writable(file->f_mapping);
if (error)
goto free_vma;
@@ -2671,7 +2671,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_iter_store(&vmi, vma);
mm->map_count++;
if (vma->vm_file) {
- if (vma->vm_flags & VM_SHARED)
+ if (vma_is_shared_maywrite(vma))
mapping_allow_writable(vma->vm_file->f_mapping);
flush_dcache_mmap_lock(vma->vm_file->f_mapping);
@@ -2688,7 +2688,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Once vma denies write, undo our temporary denial count */
unmap_writable:
- if (file && vm_flags & VM_SHARED)
+ if (file && is_shared_maywrite(vm_flags))
mapping_unmap_writable(file->f_mapping);
file = vma->vm_file;
expanded:
@@ -2734,7 +2734,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start,
vma->vm_end, true);
}
- if (file && (vm_flags & VM_SHARED))
+ if (file && is_shared_maywrite(vm_flags))
mapping_unmap_writable(file->f_mapping);
free_vma:
vm_area_free(vma);
--
2.40.0
In order for a F_SEAL_WRITE sealed memfd mapping to have an opportunity to
clear VM_MAYWRITE, we must be able to invoke the appropriate vm_ops->mmap()
handler to do so. We would otherwise fail the mapping_map_writable() check
before we had the opportunity to avoid it.
This patch moves this check after the call_mmap() invocation. Only memfd
actively denies write access causing a potential failure here (in
memfd_add_seals()), so there should be no impact on non-memfd cases.
This patch makes the userland-visible change that MAP_SHARED, PROT_READ
mappings of an F_SEAL_WRITE sealed memfd mapping will now succeed.
Signed-off-by: Lorenzo Stoakes <[email protected]>
---
mm/mmap.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index c96dcce90772..a166e9f3c474 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2596,17 +2596,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma->vm_pgoff = pgoff;
if (file) {
- if (is_shared_maywrite(vm_flags)) {
- error = mapping_map_writable(file->f_mapping);
- if (error)
- goto free_vma;
- }
-
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
+ if (vma_is_shared_maywrite(vma)) {
+ error = mapping_map_writable(file->f_mapping);
+ if (error)
+ goto unmap_and_free_vma;
+ }
+
/*
* Expansion is handled above, merging is handled below.
* Drivers should not alter the address of the VMA.
--
2.40.0
Check for F_SEAL_WRITE as well for which the precise same logic can
reasonably be applied, however so far this code will simply not be run as
the mapping_map_writable() call occurs before shmem_mmap() or
hugetlbfs_file_mmap() and thus would error out in the case of a read-only
shared mapping before the logic could be applied.
This therefore has no impact until the following patch which changes the
order in which the *_mmap() functions are called.
Signed-off-by: Lorenzo Stoakes <[email protected]>
---
fs/hugetlbfs/inode.c | 2 +-
include/linux/mm.h | 13 +++++++------
mm/shmem.c | 2 +-
3 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 702d79639c0d..8ab8840707ac 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -135,7 +135,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
vma->vm_ops = &hugetlb_vm_ops;
- ret = seal_check_future_write(info->seals, vma);
+ ret = seal_check_write(info->seals, vma);
if (ret)
return ret;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8e64041b1703..ddf1b35b9dbb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3775,16 +3775,17 @@ static inline void mem_dump_obj(void *object) {}
#endif
/**
- * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
+ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flag and
+ * handle it.
* @seals: the seals to check
* @vma: the vma to operate on
*
- * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
- * the vma flags. Return 0 if check pass, or <0 for errors.
+ * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
+ * check/handling on the vma flags. Return 0 if check pass, or <0 for errors.
*/
-static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
+static inline int seal_check_write(int seals, struct vm_area_struct *vma)
{
- if (seals & F_SEAL_FUTURE_WRITE) {
+ if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
/*
* New PROT_WRITE and MAP_SHARED mmaps are not allowed when
* "future write" seal active.
@@ -3793,7 +3794,7 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
return -EPERM;
/*
- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+ * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
* MAP_SHARED and read-only, take care to not allow mprotect to
* revert protections on such mappings. Do this only for shared
* mappings. For private mappings, don't need to mask
diff --git a/mm/shmem.c b/mm/shmem.c
index 9218c955f482..863f2ff9fab8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2313,7 +2313,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
struct shmem_inode_info *info = SHMEM_I(inode);
int ret;
- ret = seal_check_future_write(info->seals, vma);
+ ret = seal_check_write(info->seals, vma);
if (ret)
return ret;
--
2.40.0
Hi!
On Mon 03-04-23 23:28:29, Lorenzo Stoakes wrote:
> This patch series is in two parts:-
>
> 1. Currently there are a number of places in the kernel where we assume
> VM_SHARED implies that a mapping is writable. Let's be slightly less
> strict and relax this restriction in the case that VM_MAYWRITE is not
> set.
>
> This should have no noticeable impact as the lack of VM_MAYWRITE implies
> that the mapping can not be made writable via mprotect() or any other
> means.
>
> 2. Align the behaviour of F_SEAL_WRITE and F_SEAL_FUTURE_WRITE on mmap().
> The latter already clears the VM_MAYWRITE flag for a sealed read-only
> mapping, we simply extend this to F_SEAL_WRITE too.
>
> For this to have effect, we must also invoke call_mmap() before
> mapping_map_writable().
>
> As this is quite a fundamental change on the assumptions around VM_SHARED
> and since this causes a visible change to userland (in permitting read-only
> shared mappings on F_SEAL_WRITE mappings), I am putting forward as an RFC
> to see if there is anything terribly wrong with it.
So what I miss in this series is what the motivation is. Is it that you need
to map F_SEAL_WRITE read-only? Why?
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon 03-04-23 23:28:32, Lorenzo Stoakes wrote:
> In order for a F_SEAL_WRITE sealed memfd mapping to have an opportunity to
> clear VM_MAYWRITE, we must be able to invoke the appropriate vm_ops->mmap()
> handler to do so. We would otherwise fail the mapping_map_writable() check
> before we had the opportunity to avoid it.
>
> This patch moves this check after the call_mmap() invocation. Only memfd
> actively denies write access causing a potential failure here (in
> memfd_add_seals()), so there should be no impact on non-memfd cases.
>
> This patch makes the userland-visible change that MAP_SHARED, PROT_READ
> mappings of an F_SEAL_WRITE sealed memfd mapping will now succeed.
>
> Signed-off-by: Lorenzo Stoakes <[email protected]>
> ---
> mm/mmap.c | 12 ++++++------
> 1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index c96dcce90772..a166e9f3c474 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -2596,17 +2596,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> vma->vm_pgoff = pgoff;
>
> if (file) {
> - if (is_shared_maywrite(vm_flags)) {
> - error = mapping_map_writable(file->f_mapping);
> - if (error)
> - goto free_vma;
> - }
> -
> vma->vm_file = get_file(file);
> error = call_mmap(file, vma);
> if (error)
> goto unmap_and_free_vma;
>
> + if (vma_is_shared_maywrite(vma)) {
> + error = mapping_map_writable(file->f_mapping);
> + if (error)
> + goto unmap_and_free_vma;
Shouldn't we rather jump to close_and_free_vma?
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Fri, Apr 21, 2023 at 11:06:28AM +0200, Jan Kara wrote:
> On Mon 03-04-23 23:28:32, Lorenzo Stoakes wrote:
> > In order for a F_SEAL_WRITE sealed memfd mapping to have an opportunity to
> > clear VM_MAYWRITE, we must be able to invoke the appropriate vm_ops->mmap()
> > handler to do so. We would otherwise fail the mapping_map_writable() check
> > before we had the opportunity to avoid it.
> >
> > This patch moves this check after the call_mmap() invocation. Only memfd
> > actively denies write access causing a potential failure here (in
> > memfd_add_seals()), so there should be no impact on non-memfd cases.
> >
> > This patch makes the userland-visible change that MAP_SHARED, PROT_READ
> > mappings of an F_SEAL_WRITE sealed memfd mapping will now succeed.
> >
> > Signed-off-by: Lorenzo Stoakes <[email protected]>
> > ---
> > mm/mmap.c | 12 ++++++------
> > 1 file changed, 6 insertions(+), 6 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index c96dcce90772..a166e9f3c474 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -2596,17 +2596,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > vma->vm_pgoff = pgoff;
> >
> > if (file) {
> > - if (is_shared_maywrite(vm_flags)) {
> > - error = mapping_map_writable(file->f_mapping);
> > - if (error)
> > - goto free_vma;
> > - }
> > -
> > vma->vm_file = get_file(file);
> > error = call_mmap(file, vma);
> > if (error)
> > goto unmap_and_free_vma;
> >
> > + if (vma_is_shared_maywrite(vma)) {
> > + error = mapping_map_writable(file->f_mapping);
> > + if (error)
> > + goto unmap_and_free_vma;
>
> Shouldn't we rather jump to close_and_free_vma?
You're right, we may need to call vma->vm_ops->close() to match the
->mmap().
>
> Honza
> --
> Jan Kara <[email protected]>
> SUSE Labs, CR
On Fri, Apr 21, 2023 at 11:01:26AM +0200, Jan Kara wrote:
> Hi!
>
> On Mon 03-04-23 23:28:29, Lorenzo Stoakes wrote:
> > This patch series is in two parts:-
> >
> > 1. Currently there are a number of places in the kernel where we assume
> > VM_SHARED implies that a mapping is writable. Let's be slightly less
> > strict and relax this restriction in the case that VM_MAYWRITE is not
> > set.
> >
> > This should have no noticeable impact as the lack of VM_MAYWRITE implies
> > that the mapping can not be made writable via mprotect() or any other
> > means.
> >
> > 2. Align the behaviour of F_SEAL_WRITE and F_SEAL_FUTURE_WRITE on mmap().
> > The latter already clears the VM_MAYWRITE flag for a sealed read-only
> > mapping, we simply extend this to F_SEAL_WRITE too.
> >
> > For this to have effect, we must also invoke call_mmap() before
> > mapping_map_writable().
> >
> > As this is quite a fundamental change on the assumptions around VM_SHARED
> > and since this causes a visible change to userland (in permitting read-only
> > shared mappings on F_SEAL_WRITE mappings), I am putting forward as an RFC
> > to see if there is anything terribly wrong with it.
>
> So what I miss in this series is what the motivation is. Is it that you need
> to map F_SEAL_WRITE read-only? Why?
>
This originated from the discussion in [1], which refers to the bug
reported in [2]. Essentially the user is write-sealing a memfd then trying
to mmap it read-only, but receives an -EPERM error.
F_SEAL_FUTURE_WRITE _does_ explicitly permit this but F_SEAL_WRITE does not.
The fcntl() man page states:
Furthermore, trying to create new shared, writable memory-mappings via
mmap(2) will also fail with EPERM.
So the kernel does not behave as the documentation states.
I took the user-supplied repro and slightly modified it, enclosed
below. After this patch series, this code works correctly.
I think there's definitely a case for the VM_MAYWRITE part of this patch
series even if the memfd bits are not considered useful, as we do seem to
make the implicit assumption that MAP_SHARED == writable even if
!VM_MAYWRITE which seems odd.
Reproducer:-
int main()
{
int fd = memfd_create("test", MFD_ALLOW_SEALING);
if (fd == -1) {
perror("memfd_create");
return EXIT_FAILURE;
}
write(fd, "test", 4);
if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) == -1) {
perror("fcntl");
return EXIT_FAILURE;
}
void *ret = mmap(NULL, 4, PROT_READ, MAP_SHARED, fd, 0);
if (ret == MAP_FAILED) {
perror("mmap");
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
[1]:https://lore.kernel.org/all/[email protected]/
[2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238
> Honza
> --
> Jan Kara <[email protected]>
> SUSE Labs, CR
On Fri 21-04-23 22:23:12, Lorenzo Stoakes wrote:
> On Fri, Apr 21, 2023 at 11:01:26AM +0200, Jan Kara wrote:
> > Hi!
> >
> > On Mon 03-04-23 23:28:29, Lorenzo Stoakes wrote:
> > > This patch series is in two parts:-
> > >
> > > 1. Currently there are a number of places in the kernel where we assume
> > > VM_SHARED implies that a mapping is writable. Let's be slightly less
> > > strict and relax this restriction in the case that VM_MAYWRITE is not
> > > set.
> > >
> > > This should have no noticeable impact as the lack of VM_MAYWRITE implies
> > > that the mapping can not be made writable via mprotect() or any other
> > > means.
> > >
> > > 2. Align the behaviour of F_SEAL_WRITE and F_SEAL_FUTURE_WRITE on mmap().
> > > The latter already clears the VM_MAYWRITE flag for a sealed read-only
> > > mapping, we simply extend this to F_SEAL_WRITE too.
> > >
> > > For this to have effect, we must also invoke call_mmap() before
> > > mapping_map_writable().
> > >
> > > As this is quite a fundamental change on the assumptions around VM_SHARED
> > > and since this causes a visible change to userland (in permitting read-only
> > > shared mappings on F_SEAL_WRITE mappings), I am putting forward as an RFC
> > > to see if there is anything terribly wrong with it.
> >
> > So what I miss in this series is what the motivation is. Is it that you need
> > to map F_SEAL_WRITE read-only? Why?
> >
>
> This originated from the discussion in [1], which refers to the bug
> reported in [2]. Essentially the user is write-sealing a memfd then trying
> to mmap it read-only, but receives an -EPERM error.
>
> F_SEAL_FUTURE_WRITE _does_ explicitly permit this but F_SEAL_WRITE does not.
>
> The fcntl() man page states:
>
> Furthermore, trying to create new shared, writable memory-mappings via
> mmap(2) will also fail with EPERM.
>
> So the kernel does not behave as the documentation states.
>
> I took the user-supplied repro and slightly modified it, enclosed
> below. After this patch series, this code works correctly.
>
> I think there's definitely a case for the VM_MAYWRITE part of this patch
> series even if the memfd bits are not considered useful, as we do seem to
> make the implicit assumption that MAP_SHARED == writable even if
> !VM_MAYWRITE which seems odd.
Thanks for the explanation! Could you please include this information in
the cover letter (perhaps in a form of a short note and reference to the
mailing list) for future reference? Thanks!
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon, Apr 24, 2023 at 02:19:36PM +0200, Jan Kara wrote:
> On Fri 21-04-23 22:23:12, Lorenzo Stoakes wrote:
> > On Fri, Apr 21, 2023 at 11:01:26AM +0200, Jan Kara wrote:
> > > Hi!
> > >
> > > On Mon 03-04-23 23:28:29, Lorenzo Stoakes wrote:
> > > > This patch series is in two parts:-
> > > >
> > > > 1. Currently there are a number of places in the kernel where we assume
> > > > VM_SHARED implies that a mapping is writable. Let's be slightly less
> > > > strict and relax this restriction in the case that VM_MAYWRITE is not
> > > > set.
> > > >
> > > > This should have no noticeable impact as the lack of VM_MAYWRITE implies
> > > > that the mapping can not be made writable via mprotect() or any other
> > > > means.
> > > >
> > > > 2. Align the behaviour of F_SEAL_WRITE and F_SEAL_FUTURE_WRITE on mmap().
> > > > The latter already clears the VM_MAYWRITE flag for a sealed read-only
> > > > mapping, we simply extend this to F_SEAL_WRITE too.
> > > >
> > > > For this to have effect, we must also invoke call_mmap() before
> > > > mapping_map_writable().
> > > >
> > > > As this is quite a fundamental change on the assumptions around VM_SHARED
> > > > and since this causes a visible change to userland (in permitting read-only
> > > > shared mappings on F_SEAL_WRITE mappings), I am putting forward as an RFC
> > > > to see if there is anything terribly wrong with it.
> > >
> > > So what I miss in this series is what the motivation is. Is it that you need
> > > to map F_SEAL_WRITE read-only? Why?
> > >
> >
> > This originated from the discussion in [1], which refers to the bug
> > reported in [2]. Essentially the user is write-sealing a memfd then trying
> > to mmap it read-only, but receives an -EPERM error.
> >
> > F_SEAL_FUTURE_WRITE _does_ explicitly permit this but F_SEAL_WRITE does not.
> >
> > The fcntl() man page states:
> >
> > Furthermore, trying to create new shared, writable memory-mappings via
> > mmap(2) will also fail with EPERM.
> >
> > So the kernel does not behave as the documentation states.
> >
> > I took the user-supplied repro and slightly modified it, enclosed
> > below. After this patch series, this code works correctly.
> >
> > I think there's definitely a case for the VM_MAYWRITE part of this patch
> > series even if the memfd bits are not considered useful, as we do seem to
> > make the implicit assumption that MAP_SHARED == writable even if
> > !VM_MAYWRITE which seems odd.
>
> Thanks for the explanation! Could you please include this information in
> the cover letter (perhaps in a form of a short note and reference to the
> mailing list) for future reference? Thanks!
>
> Honza
>
Sure, apologies for not being clear about that :)
I may respin this as a non-RFC (with updated description of course) as its
received very little attention as an RFC and I don't think it's so
insane/huge a concept as to warrant remaining one.
> --
> Jan Kara <[email protected]>
> SUSE Labs, CR