These 2 patches are based on the discussion of "Sealed memfd & no-fault mmap"
at https://bit.ly/3pdwOGR
patch 1: make "vm_flags" a u64, so that enough flag bits are available on
32-bit architectures. This lets us add VM_NOSIGBUS, which is bit 38.
patch 2: support no-fault mmap for shmem read
Ming Lin (2):
mm: make "vm_flags" be an u64
mm: adds NOSIGBUS extension for out-of-band shmem read
arch/arm64/Kconfig | 1 -
arch/powerpc/Kconfig | 1 -
arch/x86/Kconfig | 1 -
include/linux/mm.h | 102 ++++++++++++++++-----------------
include/linux/mm_types.h | 4 +-
include/linux/mman.h | 5 +-
include/uapi/asm-generic/mman-common.h | 1 +
mm/Kconfig | 2 -
mm/memory.c | 2 +-
mm/mmap.c | 5 +-
mm/shmem.c | 17 +++++-
11 files changed, 76 insertions(+), 65 deletions(-)
--
1.8.3.1
Add a new mmap() flag, MAP_NOSIGBUS, which requests the behavior
"don't SIGBUS on read beyond i_size". This flag is only allowed
for read-only shmem mappings.
If you use MAP_NOSIGBUS, and you access pages that don't have a backing
store, you will get zero pages, and they will NOT BE SYNCHRONIZED with
the backing store possibly later being updated.
Any user that uses MAP_NOSIGBUS had better just accept that it's not
compatible with expanding the shmem backing store later.
Signed-off-by: Ming Lin <[email protected]>
---
include/linux/mm.h | 2 ++
include/linux/mman.h | 1 +
include/uapi/asm-generic/mman-common.h | 1 +
mm/mmap.c | 3 +++
mm/shmem.c | 17 ++++++++++++++++-
5 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e9d67bc..5d0e0dc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -373,6 +373,8 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
# define VM_UFFD_MINOR VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+#define VM_NOSIGBUS VM_FLAGS_BIT(38) /* Do not SIGBUS on out-of-band shmem read */
+
/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
diff --git a/include/linux/mman.h b/include/linux/mman.h
index b2cbae9..c966b08 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -154,6 +154,7 @@ static inline bool arch_validate_flags(unsigned long flags)
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
+ _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
arch_calc_vm_flag_bits(flags);
}
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f94f65d..55f4be0 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -29,6 +29,7 @@
#define MAP_HUGETLB 0x040000 /* create a huge page mapping */
#define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
+#define MAP_NOSIGBUS 0x200000 /* do not SIGBUS on out-of-band shmem read */
#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */
diff --git a/mm/mmap.c b/mm/mmap.c
index 096bba4..69cd856 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1419,6 +1419,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!len)
return -EINVAL;
+ if ((flags & MAP_NOSIGBUS) && ((prot & PROT_WRITE) || !shmem_file(file)))
+ return -EINVAL;
+
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
diff --git a/mm/shmem.c b/mm/shmem.c
index 5d46611..5d15b08 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1812,7 +1812,22 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
repeat:
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
- return -EINVAL;
+ if (!vma || !(vma->vm_flags & VM_NOSIGBUS))
+ return -EINVAL;
+
+ vma->vm_flags |= VM_MIXEDMAP;
+ /*
+ * Get zero page for MAP_NOSIGBUS mapping, which isn't
+ * coherent wrt shmem contents that are expanded and
+ * filled in later.
+ */
+ error = vm_insert_page(vma, (unsigned long)vmf->address,
+ ZERO_PAGE(0));
+ if (error)
+ return error;
+
+ *fault_type = VM_FAULT_NOPAGE;
+ return 0;
}
sbinfo = SHMEM_SB(inode->i_sb);
--
1.8.3.1
Make "vm_flags" a u64 so that we have enough flag bits on 32-bit architectures.
Signed-off-by: Ming Lin <[email protected]>
---
arch/arm64/Kconfig | 1 -
arch/powerpc/Kconfig | 1 -
arch/x86/Kconfig | 1 -
include/linux/mm.h | 100 ++++++++++++++++++++++-------------------------
include/linux/mm_types.h | 4 +-
include/linux/mman.h | 4 +-
mm/Kconfig | 2 -
mm/memory.c | 2 +-
mm/mmap.c | 2 +-
9 files changed, 53 insertions(+), 64 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9f1d856..c6960ea 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1658,7 +1658,6 @@ config ARM64_MTE
depends on AS_HAS_LSE_ATOMICS
# Required for tag checking in the uaccess routines
depends on ARM64_PAN
- select ARCH_USES_HIGH_VMA_FLAGS
help
Memory Tagging (part of the ARMv8.5 Extensions) provides
architectural support for run-time, always-on detection of
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 088dd2a..5c1b49e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -940,7 +940,6 @@ config PPC_MEM_KEYS
prompt "PowerPC Memory Protection Keys"
def_bool y
depends on PPC_BOOK3S_64
- select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_HAS_PKEYS
help
Memory Protection Keys provides a mechanism for enforcing
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0045e1b..a885336 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1874,7 +1874,6 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
def_bool y
# Note: only available in 64-bit mode
depends on X86_64 && (CPU_SUP_INTEL || CPU_SUP_AMD)
- select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_HAS_PKEYS
help
Memory Protection Keys provides a mechanism for enforcing
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c274f75..e9d67bc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -264,73 +264,68 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
extern unsigned int kobjsize(const void *objp);
#endif
+#define VM_FLAGS_BIT(N) (1ULL << (N))
+
/*
* vm_flags in vm_area_struct, see mm_types.h.
* When changing, update also include/trace/events/mmflags.h
*/
#define VM_NONE 0x00000000
-#define VM_READ 0x00000001 /* currently active flags */
-#define VM_WRITE 0x00000002
-#define VM_EXEC 0x00000004
-#define VM_SHARED 0x00000008
+#define VM_READ VM_FLAGS_BIT(0) /* currently active flags */
+#define VM_WRITE VM_FLAGS_BIT(1)
+#define VM_EXEC VM_FLAGS_BIT(2)
+#define VM_SHARED VM_FLAGS_BIT(3)
/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
-#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */
-#define VM_MAYWRITE 0x00000020
-#define VM_MAYEXEC 0x00000040
-#define VM_MAYSHARE 0x00000080
-
-#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
-#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
-#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
-#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
-#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
-
-#define VM_LOCKED 0x00002000
-#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
-
- /* Used by sys_madvise() */
-#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
-#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
-
-#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
-#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
-#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
-#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
-#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
-#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
-#define VM_SYNC 0x00800000 /* Synchronous page faults */
-#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
-#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
-#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
+#define VM_MAYREAD VM_FLAGS_BIT(4) /* limits for mprotect() etc */
+#define VM_MAYWRITE VM_FLAGS_BIT(5)
+#define VM_MAYEXEC VM_FLAGS_BIT(6)
+#define VM_MAYSHARE VM_FLAGS_BIT(7)
+
+#define VM_GROWSDOWN VM_FLAGS_BIT(8) /* general info on the segment */
+#define VM_UFFD_MISSING VM_FLAGS_BIT(9) /* missing pages tracking */
+#define VM_PFNMAP VM_FLAGS_BIT(10) /* Page-ranges managed without "struct page", just pure PFN */
+#define VM_DENYWRITE VM_FLAGS_BIT(11) /* ETXTBSY on write attempts.. */
+#define VM_UFFD_WP VM_FLAGS_BIT(12) /* wrprotect pages tracking */
+
+#define VM_LOCKED VM_FLAGS_BIT(13)
+#define VM_IO VM_FLAGS_BIT(14) /* Memory mapped I/O or similar */
+
+ /* Used by sys_madvise() */
+#define VM_SEQ_READ VM_FLAGS_BIT(15) /* App will access data sequentially */
+#define VM_RAND_READ VM_FLAGS_BIT(16) /* App will not benefit from clustered reads */
+
+#define VM_DONTCOPY VM_FLAGS_BIT(17) /* Do not copy this vma on fork */
+#define VM_DONTEXPAND VM_FLAGS_BIT(18) /* Cannot expand with mremap() */
+#define VM_LOCKONFAULT VM_FLAGS_BIT(19) /* Lock the pages covered when they are faulted in */
+#define VM_ACCOUNT VM_FLAGS_BIT(20) /* Is a VM accounted object */
+#define VM_NORESERVE VM_FLAGS_BIT(21) /* should the VM suppress accounting */
+#define VM_HUGETLB VM_FLAGS_BIT(22) /* Huge TLB Page VM */
+#define VM_SYNC VM_FLAGS_BIT(23) /* Synchronous page faults */
+#define VM_ARCH_1 VM_FLAGS_BIT(24) /* Architecture-specific flag */
+#define VM_WIPEONFORK VM_FLAGS_BIT(25) /* Wipe VMA contents in child. */
+#define VM_DONTDUMP VM_FLAGS_BIT(26) /* Do not include in the core dump */
#ifdef CONFIG_MEM_SOFT_DIRTY
-# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */
+# define VM_SOFTDIRTY VM_FLAGS_BIT(27) /* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY 0
#endif
-#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
-#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */
-#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
-#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
-
-#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
-#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
-#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
-#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
-#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
-#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
-#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
+#define VM_MIXEDMAP VM_FLAGS_BIT(28) /* Can contain "struct page" and pure PFN pages */
+#define VM_HUGEPAGE VM_FLAGS_BIT(29) /* MADV_HUGEPAGE marked this vma */
+#define VM_NOHUGEPAGE VM_FLAGS_BIT(30) /* MADV_NOHUGEPAGE marked this vma */
+#define VM_MERGEABLE VM_FLAGS_BIT(31) /* KSM may merge identical pages */
+
+#define VM_HIGH_ARCH_0 VM_FLAGS_BIT(32)
+#define VM_HIGH_ARCH_1 VM_FLAGS_BIT(33)
+#define VM_HIGH_ARCH_2 VM_FLAGS_BIT(34)
+#define VM_HIGH_ARCH_3 VM_FLAGS_BIT(35)
+#define VM_HIGH_ARCH_4 VM_FLAGS_BIT(36)
#ifdef CONFIG_ARCH_HAS_PKEYS
-# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
+# define VM_PKEY_SHIFT 32
# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */
# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */
# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
@@ -373,8 +368,7 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
#endif
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-# define VM_UFFD_MINOR_BIT 37
-# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
+# define VM_UFFD_MINOR VM_FLAGS_BIT(37) /* UFFD minor faults */
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
# define VM_UFFD_MINOR VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5aacc1c..5347293 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -264,7 +264,7 @@ struct page_frag_cache {
bool pfmemalloc;
};
-typedef unsigned long vm_flags_t;
+typedef u64 vm_flags_t;
/*
* A region containing a mapping of a non-memory backed file under NOMMU
@@ -330,7 +330,7 @@ struct vm_area_struct {
* See vmf_insert_mixed_prot() for discussion.
*/
pgprot_t vm_page_prot;
- unsigned long vm_flags; /* Flags, see mm.h. */
+ vm_flags_t vm_flags; /* Flags, see mm.h. */
/*
* For areas with an address space and backing store,
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 629cefc..b2cbae9 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -135,7 +135,7 @@ static inline bool arch_validate_flags(unsigned long flags)
/*
* Combine the mmap "prot" argument into "vm_flags" used internally.
*/
-static inline unsigned long
+static inline vm_flags_t
calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
{
return _calc_vm_trans(prot, PROT_READ, VM_READ ) |
@@ -147,7 +147,7 @@ static inline bool arch_validate_flags(unsigned long flags)
/*
* Combine the mmap "flags" argument into "vm_flags" used internally.
*/
-static inline unsigned long
+static inline vm_flags_t
calc_vm_flag_bits(unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
diff --git a/mm/Kconfig b/mm/Kconfig
index 02d44e3..aa8efba 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -830,8 +830,6 @@ config DEVICE_PRIVATE
config VMAP_PFN
bool
-config ARCH_USES_HIGH_VMA_FLAGS
- bool
config ARCH_HAS_PKEYS
bool
diff --git a/mm/memory.c b/mm/memory.c
index 730daa0..eff2a47 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -550,7 +550,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page, "bad pte");
- pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
+ pr_alert("addr:%px vm_flags:%08llx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
vma->vm_file,
diff --git a/mm/mmap.c b/mm/mmap.c
index 0584e54..096bba4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1353,7 +1353,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
}
static inline int mlock_future_check(struct mm_struct *mm,
- unsigned long flags,
+ vm_flags_t flags,
unsigned long len)
{
unsigned long locked, lock_limit;
--
1.8.3.1
On 6/1/2021 5:16 PM, Linus Torvalds wrote:
> This series passes my "looks fine, is simple and straightforward" test.
>
> One nit:
>
> On Tue, Jun 1, 2021 at 1:22 PM Ming Lin <[email protected]> wrote:
>>
>> + error = vm_insert_page(vma, (unsigned long)vmf->address,
>> + ZERO_PAGE(0));
>
> On architectures where this matters - bad virtual caches - it would be
> better to use ZERO_PAGE(vmf->address).
>
> It doesn't make a difference on any sane architecture, but it's the
> RightThing(tm) to do.
grep -Rn ZERO_PAGE linux/arch/ | grep define
s390 and mips do use the "address" of ZERO_PAGE(address)
Fixed.
On Tue, 1 Jun 2021, Linus Torvalds wrote:
> This series passes my "looks fine, is simple and straightforward" test.
I'm sorry, but it also passes my "hack that we do not want in shmem.c"
test. I'll say more in response to the preceding mail.
Hugh
>
> One nit:
>
> On Tue, Jun 1, 2021 at 1:22 PM Ming Lin <[email protected]> wrote:
> >
> > + error = vm_insert_page(vma, (unsigned long)vmf->address,
> > + ZERO_PAGE(0));
>
> On architectures where this matters - bad virtual caches - it would be
> better to use ZERO_PAGE(vmf->address).
>
> It doesn't make a difference on any sane architecture, but it's the
> RightThing(tm) to do.
>
> Linus
>
This series passes my "looks fine, is simple and straightforward" test.
One nit:
On Tue, Jun 1, 2021 at 1:22 PM Ming Lin <[email protected]> wrote:
>
> + error = vm_insert_page(vma, (unsigned long)vmf->address,
> + ZERO_PAGE(0));
On architectures where this matters - bad virtual caches - it would be
better to use ZERO_PAGE(vmf->address).
It doesn't make a difference on any sane architecture, but it's the
RightThing(tm) to do.
Linus
Hi Ming,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linux/master]
[also build test ERROR on arm64/for-next/core powerpc/next asm-generic/master linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master tip/x86/core next-20210601]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: parisc-randconfig-r015-20210601 (attached as .config)
compiler: hppa-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/c14d1ac79e68e85a2ff97e19c36100990b09a7c3
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
git checkout c14d1ac79e68e85a2ff97e19c36100990b09a7c3
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=parisc
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
In file included from mm/filemap.c:24:
include/linux/mman.h: In function 'calc_vm_flag_bits':
>> include/linux/mman.h:157:31: error: 'MAP_NOSIGBUS' undeclared (first use in this function); did you mean 'VM_NOSIGBUS'?
157 | _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
| ^~~~~~~~~~~~
include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
131 | ((!(bit1) || !(bit2)) ? 0 : \
| ^~~~
include/linux/mman.h:157:31: note: each undeclared identifier is reported only once for each function it appears in
157 | _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
| ^~~~~~~~~~~~
include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
131 | ((!(bit1) || !(bit2)) ? 0 : \
| ^~~~
--
In file included from mm/util.c:15:
include/linux/mman.h: In function 'calc_vm_flag_bits':
>> include/linux/mman.h:157:31: error: 'MAP_NOSIGBUS' undeclared (first use in this function); did you mean 'VM_NOSIGBUS'?
157 | _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
| ^~~~~~~~~~~~
include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
131 | ((!(bit1) || !(bit2)) ? 0 : \
| ^~~~
include/linux/mman.h:157:31: note: each undeclared identifier is reported only once for each function it appears in
157 | _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
| ^~~~~~~~~~~~
include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
131 | ((!(bit1) || !(bit2)) ? 0 : \
| ^~~~
mm/util.c: In function 'page_mapping':
mm/util.c:700:15: warning: variable 'entry' set but not used [-Wunused-but-set-variable]
700 | swp_entry_t entry;
| ^~~~~
--
In file included from mm/mmap.c:18:
include/linux/mman.h: In function 'calc_vm_flag_bits':
>> include/linux/mman.h:157:31: error: 'MAP_NOSIGBUS' undeclared (first use in this function); did you mean 'VM_NOSIGBUS'?
157 | _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
| ^~~~~~~~~~~~
include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
131 | ((!(bit1) || !(bit2)) ? 0 : \
| ^~~~
include/linux/mman.h:157:31: note: each undeclared identifier is reported only once for each function it appears in
157 | _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
| ^~~~~~~~~~~~
include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
131 | ((!(bit1) || !(bit2)) ? 0 : \
| ^~~~
mm/mmap.c: In function 'do_mmap':
>> mm/mmap.c:1422:15: error: 'MAP_NOSIGBUS' undeclared (first use in this function); did you mean 'VM_NOSIGBUS'?
1422 | if ((flags & MAP_NOSIGBUS) && ((prot & PROT_WRITE) || !shmem_file(file)))
| ^~~~~~~~~~~~
| VM_NOSIGBUS
In file included from mm/mmap.c:18:
include/linux/mman.h: In function 'calc_vm_flag_bits':
include/linux/mman.h:159:1: error: control reaches end of non-void function [-Werror=return-type]
159 | }
| ^
cc1: some warnings being treated as errors
--
In file included from drivers/char/mem.c:16:
include/linux/mman.h: In function 'calc_vm_flag_bits':
>> include/linux/mman.h:157:31: error: 'MAP_NOSIGBUS' undeclared (first use in this function); did you mean 'VM_NOSIGBUS'?
157 | _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
| ^~~~~~~~~~~~
include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
131 | ((!(bit1) || !(bit2)) ? 0 : \
| ^~~~
include/linux/mman.h:157:31: note: each undeclared identifier is reported only once for each function it appears in
157 | _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
| ^~~~~~~~~~~~
include/linux/mman.h:131:7: note: in definition of macro '_calc_vm_trans'
131 | ((!(bit1) || !(bit2)) ? 0 : \
| ^~~~
drivers/char/mem.c: At top level:
drivers/char/mem.c:95:29: warning: no previous prototype for 'unxlate_dev_mem_ptr' [-Wmissing-prototypes]
95 | #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
| ^~~~~~~~~~~~~~~~~~~
drivers/char/mem.c:96:13: note: in expansion of macro 'unxlate_dev_mem_ptr'
96 | void __weak unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
| ^~~~~~~~~~~~~~~~~~~
vim +157 include/linux/mman.h
146
147 /*
148 * Combine the mmap "flags" argument into "vm_flags" used internally.
149 */
150 static inline vm_flags_t
151 calc_vm_flag_bits(unsigned long flags)
152 {
153 return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
154 _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
155 _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
156 _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
> 157 _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
158 arch_calc_vm_flag_bits(flags);
159 }
160
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
On Tue, 1 Jun 2021, Ming Lin wrote:
> Adds new flag MAP_NOSIGBUS of mmap() to specify the behavior of
> "don't SIGBUS on read beyond i_size". This flag is only allowed
> for read only shmem mapping.
>
> If you use MAP_NOSIGBUS, and you access pages that don't have a backing
> store, you will get zero pages, and they will NOT BE SYNCHRONIZED with
> the backing store possibly later being updated.
>
> Any user that uses MAP_NOSIGBUS had better just accept that it's not
> compatible with expanding the shmem backing store later.
>
> Signed-off-by: Ming Lin <[email protected]>
I disagree with Linus on this: I think it's a mistake,
and is being targeted at tmpfs to avoid wider scrutiny.
Though I have a more constructive suggestion under your mmap.c mod.
I've added linux-fsdevel and linux-api to the Cc list:
linux-api definitely needed to approve any MAP_NOSIGBUS semantics;
linux-fsdevel shouldn't be affected, but they need to know about it.
The prior discussion on "Sealed memfd & no-fault mmap" is at
https://lore.kernel.org/linux-mm/vs1Us2sm4qmfvLOqNat0-r16GyfmWzqUzQ4KHbXJwEcjhzeoQ4sBTxx7QXDG9B6zk5AeT7FsNb3CSr94LaKy6Novh1fbbw8D_BBxYsbPLms=@emersion.fr/
I've not yet seen a response from Simon Ser, as to whether this
kind of "opaque blob of zeroes" implementation would be of any
use to Wayland: you expected it to be a problem, and we shouldn't
waste any time on it if it's not going to be useful to someone.
Maybe there will be other takers (certainly SIGBUS is unpopular).
> ---
> include/linux/mm.h | 2 ++
> include/linux/mman.h | 1 +
> include/uapi/asm-generic/mman-common.h | 1 +
> mm/mmap.c | 3 +++
> mm/shmem.c | 17 ++++++++++++++++-
> 5 files changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index e9d67bc..5d0e0dc 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -373,6 +373,8 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
> # define VM_UFFD_MINOR VM_NONE
> #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
>
> +#define VM_NOSIGBUS VM_FLAGS_BIT(38) /* Do not SIGBUS on out-of-band shmem read */
"out-of-band shmem read" means nothing to me: "Do not SIGBUS on fault".
> +
> /* Bits set in the VMA until the stack is in its final location */
> #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
>
> diff --git a/include/linux/mman.h b/include/linux/mman.h
> index b2cbae9..c966b08 100644
> --- a/include/linux/mman.h
> +++ b/include/linux/mman.h
> @@ -154,6 +154,7 @@ static inline bool arch_validate_flags(unsigned long flags)
> _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
> _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
> _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
> + _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
> arch_calc_vm_flag_bits(flags);
> }
>
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index f94f65d..55f4be0 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -29,6 +29,7 @@
> #define MAP_HUGETLB 0x040000 /* create a huge page mapping */
> #define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */
> #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
> +#define MAP_NOSIGBUS 0x200000 /* do not SIGBUS on out-of-band shmem read */
Ditto.
>
> #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
> * uninitialized */
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 096bba4..69cd856 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1419,6 +1419,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> if (!len)
> return -EINVAL;
>
> + if ((flags & MAP_NOSIGBUS) && ((prot & PROT_WRITE) || !shmem_file(file)))
> + return -EINVAL;
> +
No, for several reasons.
This has nothing to do with shmem really, that's just where this patch
hacks it in - and where you have a first user in mind. If this goes
forward, please modify mm/memory.c, not mm/shmem.c, so that a fault which
would return VM_FAULT_SIGBUS in a VM_NOSIGBUS vma maps in the zero page instead.
(prot & PROT_WRITE) tells you about the mmap() flags, but says nothing
about what mprotect() could do later on. Look out for VM_SHARED and
VM_MAYSHARE and VM_MAYWRITE further down; and beware the else (!file)
block below them, shared anonymous would need more protection too.
Constructive comment: I guess much of my objection to this feature
comes from allowing it in the MAP_SHARED case. If you restrict it
to MAP_PRIVATE mapping of file, then it's less objectionable, and
you won't have to worry (so much?) about write protection. Copy
on write is normal there, and it's well established that subsequent
changes in the file will not be shared; you'd just be extending that
behaviour from writes to sigbusy reads.
And by restricting to MAP_PRIVATE, you would allow for adding a
proper MAP_SHARED implementation later, if it's thought useful
(that being the implementation which can subsequently unmap a
zero page to let new page cache be mapped).
> /*
> * Does the application expect PROT_READ to imply PROT_EXEC?
> *
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 5d46611..5d15b08 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1812,7 +1812,22 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> repeat:
> if (sgp <= SGP_CACHE &&
> ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
> - return -EINVAL;
> + if (!vma || !(vma->vm_flags & VM_NOSIGBUS))
> + return -EINVAL;
> +
> + vma->vm_flags |= VM_MIXEDMAP;
No. Presumably you hit the BUG_ON(mmap_read_trylock(vma->vm_mm))
in vm_insert_page(), so decided to modify the vm_flags here: no,
that BUG is saying you need mmap_write_lock() to write vm_flags.
And I have no idea of the ramifications of shmem in a VM_MIXEDMAP
vma; perhaps it works out fine, but I'd have to research that.
I'd rather not.
> + /*
> + * Get zero page for MAP_NOSIGBUS mapping, which isn't
> + * coherent wrt shmem contents that are expanded and
> + * filled in later.
> + */
> + error = vm_insert_page(vma, (unsigned long)vmf->address,
> + ZERO_PAGE(0));
> + if (error)
> + return error;
> +
> + *fault_type = VM_FAULT_NOPAGE;
> + return 0;
But there are other ways in which shmem_getpage_gfp() can fail and
shmem_fault() end up returning VM_FAULT_SIGBUS. Notably -ENOSPC.
It's trivial for someone to pass the MAP_NOSIGBUS user the fd of a
sparse file in a full filesystem, causing SIGBUS on access despite
MAP_NOSIGBUS. On shmem or some other filesystem.
I say the VM_FAULT_SIGBUS->map-in-zero-page handling should be back
in mm/memory.c, where it calls ->fault(): where others can review it.
One other thing while it crosses my mind. You'll need to decide
what truncating or hole-punching the file does to the zero pages
in its userspace mappings. I may turn out wrong, but I think you'll
find that truncation removes them, but hole-punch leaves them, and
ought to be modified to remove them too (it's a matter of how the
"even_cows" arg to unmap_mapping_range() is treated).
Hugh
> }
>
> sbinfo = SHMEM_SB(inode->i_sb);
> --
> 1.8.3.1
Hi Ming,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on linux/master]
[also build test WARNING on arm64/for-next/core powerpc/next asm-generic/master linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master tip/x86/core next-20210601]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: x86_64-allyesconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
# https://github.com/0day-ci/linux/commit/0b6b8b44f566199698248899d0fef7466ba6b0f3
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
git checkout 0b6b8b44f566199698248899d0fef7466ba6b0f3
# save the attached .config to linux build tree
make W=1 ARCH=x86_64
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All warnings (new ones prefixed by >>):
In file included from drivers/infiniband/hw/hfi1/trace.h:57,
from drivers/infiniband/hw/hfi1/file_ops.c:61:
drivers/infiniband/hw/hfi1/file_ops.c: In function 'hfi1_file_mmap':
>> drivers/infiniband/hw/hfi1/file_ops.c:572:5: warning: format '%lx' expects argument of type 'long unsigned int', but argument 11 has type 'vm_flags_t' {aka 'long long unsigned int'} [-Wformat=]
572 | "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
573 | ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
574 | vma->vm_end - vma->vm_start, vma->vm_flags);
| ~~~~~~~~~~~~~
| |
| vm_flags_t {aka long long unsigned int}
drivers/infiniband/hw/hfi1/trace_dbg.h:133:33: note: in definition of macro 'hfi1_cdbg'
133 | __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
| ^~~
drivers/infiniband/hw/hfi1/file_ops.c:572:70: note: format string is defined here
572 | "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
| ~~^
| |
| long unsigned int
| %llx
--
In file included from include/linux/device.h:15,
from include/linux/pci.h:37,
from drivers/infiniband/hw/qib/qib_file_ops.c:35:
drivers/infiniband/hw/qib/qib_file_ops.c: In function 'mmap_rcvegrbufs':
>> drivers/infiniband/hw/qib/qib_file_ops.c:849:4: warning: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vm_flags_t' {aka 'long long unsigned int'} [-Wformat=]
849 | "Can't map eager buffers as writable (flags=%lx)\n",
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/dev_printk.h:19:22: note: in definition of macro 'dev_fmt'
19 | #define dev_fmt(fmt) fmt
| ^~~
drivers/infiniband/hw/qib/qib.h:1472:2: note: in expansion of macro 'dev_info'
1472 | dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__)
| ^~~~~~~~
drivers/infiniband/hw/qib/qib_file_ops.c:848:3: note: in expansion of macro 'qib_devinfo'
848 | qib_devinfo(dd->pcidev,
| ^~~~~~~~~~~
drivers/infiniband/hw/qib/qib_file_ops.c:849:50: note: format string is defined here
849 | "Can't map eager buffers as writable (flags=%lx)\n",
| ~~^
| |
| long unsigned int
| %llx
In file included from include/linux/device.h:15,
from include/linux/pci.h:37,
from drivers/infiniband/hw/qib/qib_file_ops.c:35:
drivers/infiniband/hw/qib/qib_file_ops.c: In function 'mmap_kvaddr':
drivers/infiniband/hw/qib/qib_file_ops.c:938:6: warning: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vm_flags_t' {aka 'long long unsigned int'} [-Wformat=]
938 | "Can't map eager buffers as writable (flags=%lx)\n",
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/dev_printk.h:19:22: note: in definition of macro 'dev_fmt'
19 | #define dev_fmt(fmt) fmt
| ^~~
drivers/infiniband/hw/qib/qib.h:1472:2: note: in expansion of macro 'dev_info'
1472 | dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__)
| ^~~~~~~~
drivers/infiniband/hw/qib/qib_file_ops.c:937:4: note: in expansion of macro 'qib_devinfo'
937 | qib_devinfo(dd->pcidev,
| ^~~~~~~~~~~
drivers/infiniband/hw/qib/qib_file_ops.c:938:52: note: format string is defined here
938 | "Can't map eager buffers as writable (flags=%lx)\n",
| ~~^
| |
| long unsigned int
| %llx
--
In file included from drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_priv.h:48,
from drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_chardev.c:38:
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_chardev.c: In function 'kfd_mmio_mmap':
>> drivers/gpu/drm/amd/amdgpu/../amdgpu/amdgpu.h:35:21: warning: format '%lX' expects argument of type 'long unsigned int', but argument 6 has type 'vm_flags_t' {aka 'long long unsigned int'} [-Wformat=]
35 | #define pr_fmt(fmt) "amdgpu: " fmt
| ^~~~~~~~~~
include/linux/dynamic_debug.h:129:15: note: in expansion of macro 'pr_fmt'
129 | func(&id, ##__VA_ARGS__); \
| ^~~~~~~~~~~
include/linux/dynamic_debug.h:147:2: note: in expansion of macro '__dynamic_func_call'
147 | __dynamic_func_call(__UNIQUE_ID(ddebug), fmt, func, ##__VA_ARGS__)
| ^~~~~~~~~~~~~~~~~~~
include/linux/dynamic_debug.h:157:2: note: in expansion of macro '_dynamic_func_call'
157 | _dynamic_func_call(fmt, __dynamic_pr_debug, \
| ^~~~~~~~~~~~~~~~~~
include/linux/printk.h:424:2: note: in expansion of macro 'dynamic_pr_debug'
424 | dynamic_pr_debug(fmt, ##__VA_ARGS__)
| ^~~~~~~~~~~~~~~~
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_chardev.c:1957:2: note: in expansion of macro 'pr_debug'
1957 | pr_debug("pasid 0x%x mapping mmio page\n"
| ^~~~~~~~
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_chardev.c:1960:39: note: format string is defined here
1960 | " vm_flags == 0x%04lX\n"
| ~~~~^
| |
| long unsigned int
| %04llX
vim +572 drivers/infiniband/hw/hfi1/file_ops.c
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 347
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 348 static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 349 {
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c Ira Weiny 2015-10-30 350 struct hfi1_filedata *fd = fp->private_data;
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c Ira Weiny 2015-10-30 351 struct hfi1_ctxtdata *uctxt = fd->uctxt;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 352 struct hfi1_devdata *dd;
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 353 unsigned long flags;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 354 u64 token = vma->vm_pgoff << PAGE_SHIFT,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 355 memaddr = 0;
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 356 void *memvirt = NULL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 357 u8 subctxt, mapio = 0, vmf = 0, type;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 358 ssize_t memlen = 0;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 359 int ret = 0;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 360 u16 ctxt;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 361
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 362 if (!is_valid_mmap(token) || !uctxt ||
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 363 !(vma->vm_flags & VM_SHARED)) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 364 ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 365 goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 366 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 367 dd = uctxt->dd;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 368 ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 369 subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 370 type = HFI1_MMAP_TOKEN_GET(TYPE, token);
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c Ira Weiny 2015-10-30 371 if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 372 ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 373 goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 374 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 375
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 376 flags = vma->vm_flags;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 377
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 378 switch (type) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 379 case PIO_BUFS:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 380 case PIO_BUFS_SOP:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 381 memaddr = ((dd->physaddr + TXE_PIO_SEND) +
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 382 /* chip pio base */
d32cf44a62716d drivers/staging/rdma/hfi1/file_ops.c Amitoj Kaur Chawla 2015-10-16 383 (uctxt->sc->hw_context * BIT(16))) +
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 384 /* 64K PIO space / ctxt */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 385 (type == PIO_BUFS_SOP ?
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 386 (TXE_PIO_SIZE / 2) : 0); /* sop? */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 387 /*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 388 * Map only the amount allocated to the context, not the
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 389 * entire available context's PIO space.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 390 */
437b29d1159af1 drivers/staging/rdma/hfi1/file_ops.c Amitoj Kaur Chawla 2016-03-04 391 memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 392 flags &= ~VM_MAYREAD;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 393 flags |= VM_DONTCOPY | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 394 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 395 mapio = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 396 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 397 case PIO_CRED:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 398 if (flags & VM_WRITE) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 399 ret = -EPERM;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 400 goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 401 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 402 /*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 403 * The credit return location for this context could be on the
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 404 * second or third page allocated for credit returns (if number
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 405 * of enabled contexts > 64 and 128 respectively).
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 406 */
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 407 memvirt = dd->cr_base[uctxt->numa_id].va;
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 408 memaddr = virt_to_phys(memvirt) +
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 409 (((u64)uctxt->sc->hw_free -
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 410 (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 411 memlen = PAGE_SIZE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 412 flags &= ~VM_MAYWRITE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 413 flags |= VM_DONTCOPY | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 414 /*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 415 * The driver has already allocated memory for credit
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 416 * returns and programmed it into the chip. Has that
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 417 * memory been flagged as non-cached?
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 418 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 419 /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 420 mapio = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 421 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 422 case RCV_HDRQ:
b25784312840bc drivers/infiniband/hw/hfi1/file_ops.c Mike Marciniszyn 2018-06-20 423 memlen = rcvhdrq_size(uctxt);
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 424 memvirt = uctxt->rcvhdrq;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 425 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 426 case RCV_EGRBUF: {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 427 unsigned long addr;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 428 int i;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 429 /*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 430 * The RcvEgr buffer need to be handled differently
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 431 * as multiple non-contiguous pages need to be mapped
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 432 * into the user process.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 433 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 434 memlen = uctxt->egrbufs.size;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 435 if ((vma->vm_end - vma->vm_start) != memlen) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 436 dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 437 (vma->vm_end - vma->vm_start), memlen);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 438 ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 439 goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 440 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 441 if (vma->vm_flags & VM_WRITE) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 442 ret = -EPERM;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 443 goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 444 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 445 vma->vm_flags &= ~VM_MAYWRITE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 446 addr = vma->vm_start;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 447 for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 448 memlen = uctxt->egrbufs.buffers[i].len;
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 449 memvirt = uctxt->egrbufs.buffers[i].addr;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 450 ret = remap_pfn_range(
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 451 vma, addr,
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 452 /*
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 453 * virt_to_pfn() does the same, but
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 454 * it's not available on x86_64
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 455 * when CONFIG_MMU is enabled.
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 456 */
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 457 PFN_DOWN(__pa(memvirt)),
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 458 memlen,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 459 vma->vm_page_prot);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 460 if (ret < 0)
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 461 goto done;
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 462 addr += memlen;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 463 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 464 ret = 0;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 465 goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 466 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 467 case UREGS:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 468 /*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 469 * Map only the page that contains this context's user
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 470 * registers.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 471 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 472 memaddr = (unsigned long)
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 473 (dd->physaddr + RXE_PER_CONTEXT_USER)
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 474 + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 475 /*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 476 * TidFlow table is on the same page as the rest of the
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 477 * user registers.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 478 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 479 memlen = PAGE_SIZE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 480 flags |= VM_DONTCOPY | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 481 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 482 mapio = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 483 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 484 case EVENTS:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 485 /*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 486 * Use the page where this context's flags are. User level
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 487 * knows where it's own bitmap is within the page.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 488 */
21e5acc06403f6 drivers/infiniband/hw/hfi1/file_ops.c Michael J. Ruhl 2017-09-26 489 memaddr = (unsigned long)
21e5acc06403f6 drivers/infiniband/hw/hfi1/file_ops.c Michael J. Ruhl 2017-09-26 490 (dd->events + uctxt_offset(uctxt)) & PAGE_MASK;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 491 memlen = PAGE_SIZE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 492 /*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 493 * v3.7 removes VM_RESERVED but the effect is kept by
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 494 * using VM_IO.
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 495 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 496 flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 497 vmf = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 498 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 499 case STATUS:
7709b0dc265f28 drivers/infiniband/hw/hfi1/file_ops.c Michael J. Ruhl 2019-01-17 500 if (flags & VM_WRITE) {
12220267645cb7 drivers/infiniband/hw/hfi1/file_ops.c Ira Weiny 2017-04-09 501 ret = -EPERM;
12220267645cb7 drivers/infiniband/hw/hfi1/file_ops.c Ira Weiny 2017-04-09 502 goto done;
12220267645cb7 drivers/infiniband/hw/hfi1/file_ops.c Ira Weiny 2017-04-09 503 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 504 memaddr = kvirt_to_phys((void *)dd->status);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 505 memlen = PAGE_SIZE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 506 flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 507 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 508 case RTAIL:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 509 if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 510 /*
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 511 * If the memory allocation failed, the context alloc
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 512 * also would have failed, so we would never get here
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 513 */
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 514 ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 515 goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 516 }
2fb3b5ae1ca771 drivers/infiniband/hw/hfi1/file_ops.c Mike Marciniszyn 2019-12-19 517 if ((flags & VM_WRITE) || !hfi1_rcvhdrtail_kvaddr(uctxt)) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 518 ret = -EPERM;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 519 goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 520 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 521 memlen = PAGE_SIZE;
2fb3b5ae1ca771 drivers/infiniband/hw/hfi1/file_ops.c Mike Marciniszyn 2019-12-19 522 memvirt = (void *)hfi1_rcvhdrtail_kvaddr(uctxt);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 523 flags &= ~VM_MAYWRITE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 524 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 525 case SUBCTXT_UREGS:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 526 memaddr = (u64)uctxt->subctxt_uregbase;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 527 memlen = PAGE_SIZE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 528 flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 529 vmf = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 530 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 531 case SUBCTXT_RCV_HDRQ:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 532 memaddr = (u64)uctxt->subctxt_rcvhdr_base;
b25784312840bc drivers/infiniband/hw/hfi1/file_ops.c Mike Marciniszyn 2018-06-20 533 memlen = rcvhdrq_size(uctxt) * uctxt->subctxt_cnt;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 534 flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 535 vmf = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 536 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 537 case SUBCTXT_EGRBUF:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 538 memaddr = (u64)uctxt->subctxt_rcvegrbuf;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 539 memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 540 flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 541 flags &= ~VM_MAYWRITE;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 542 vmf = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 543 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 544 case SDMA_COMP: {
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c Ira Weiny 2015-10-30 545 struct hfi1_user_sdma_comp_q *cq = fd->cq;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 546
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c Ira Weiny 2015-10-30 547 if (!cq) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 548 ret = -EFAULT;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 549 goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 550 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 551 memaddr = (u64)cq->comps;
437b29d1159af1 drivers/staging/rdma/hfi1/file_ops.c Amitoj Kaur Chawla 2016-03-04 552 memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 553 flags |= VM_IO | VM_DONTEXPAND;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 554 vmf = 1;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 555 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 556 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 557 default:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 558 ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 559 break;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 560 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 561
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 562 if ((vma->vm_end - vma->vm_start) != memlen) {
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 563 hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
9e10af4787ac51 drivers/staging/rdma/hfi1/file_ops.c Ira Weiny 2015-10-30 564 uctxt->ctxt, fd->subctxt,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 565 (vma->vm_end - vma->vm_start), memlen);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 566 ret = -EINVAL;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 567 goto done;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 568 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 569
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 570 vma->vm_flags = flags;
6c63e4238acad0 drivers/staging/rdma/hfi1/file_ops.c Sebastian Sanchez 2015-11-06 571 hfi1_cdbg(PROC,
6c63e4238acad0 drivers/staging/rdma/hfi1/file_ops.c Sebastian Sanchez 2015-11-06 @572 "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
6c63e4238acad0 drivers/staging/rdma/hfi1/file_ops.c Sebastian Sanchez 2015-11-06 573 ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 574 vma->vm_end - vma->vm_start, vma->vm_flags);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 575 if (vmf) {
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 576 vma->vm_pgoff = PFN_DOWN(memaddr);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 577 vma->vm_ops = &vm_ops;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 578 ret = 0;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 579 } else if (mapio) {
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 580 ret = io_remap_pfn_range(vma, vma->vm_start,
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 581 PFN_DOWN(memaddr),
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 582 memlen,
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 583 vma->vm_page_prot);
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 584 } else if (memvirt) {
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 585 ret = remap_pfn_range(vma, vma->vm_start,
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 586 PFN_DOWN(__pa(memvirt)),
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 587 memlen,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 588 vma->vm_page_prot);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 589 } else {
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 590 ret = remap_pfn_range(vma, vma->vm_start,
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 591 PFN_DOWN(memaddr),
60368186fd8538 drivers/infiniband/hw/hfi1/file_ops.c Tymoteusz Kielan 2016-09-06 592 memlen,
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 593 vma->vm_page_prot);
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 594 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 595 done:
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 596 return ret;
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 597 }
7724105686e718 drivers/staging/rdma/hfi1/file_ops.c Mike Marciniszyn 2015-07-30 598
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
Hi Ming,
Thank you for the patch! However, there is something to improve:
[auto build test ERROR on linux/master]
[also build test ERROR on arm64/for-next/core powerpc/next asm-generic/master linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master tip/x86/core next-20210601]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: s390-randconfig-r011-20210601 (attached as .config)
compiler: s390-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/0b6b8b44f566199698248899d0fef7466ba6b0f3
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
git checkout 0b6b8b44f566199698248899d0fef7466ba6b0f3
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=s390
If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
arch/s390/mm/gmap.c: In function 'gmap_mark_unmergeable':
>> arch/s390/mm/gmap.c:2577:25: error: passing argument 5 of 'ksm_madvise' from incompatible pointer type [-Werror=incompatible-pointer-types]
2577 | MADV_UNMERGEABLE, &vma->vm_flags);
| ^~~~~~~~~~~~~~
| |
| vm_flags_t * {aka long long unsigned int *}
In file included from arch/s390/mm/gmap.c:18:
include/linux/ksm.h:70:49: note: expected 'long unsigned int *' but argument is of type 'vm_flags_t *' {aka 'long long unsigned int *'}
70 | unsigned long end, int advice, unsigned long *vm_flags)
| ~~~~~~~~~~~~~~~^~~~~~~~
cc1: some warnings being treated as errors
vim +/ksm_madvise +2577 arch/s390/mm/gmap.c
1e133ab296f3ff Martin Schwidefsky 2016-03-08 2568
fa0c5eabbdd330 Janosch Frank 2019-07-16 2569 int gmap_mark_unmergeable(void)
fa0c5eabbdd330 Janosch Frank 2019-07-16 2570 {
fa0c5eabbdd330 Janosch Frank 2019-07-16 2571 struct mm_struct *mm = current->mm;
fa0c5eabbdd330 Janosch Frank 2019-07-16 2572 struct vm_area_struct *vma;
7a2653612bb6f1 Christian Borntraeger 2020-03-27 2573 int ret;
fa0c5eabbdd330 Janosch Frank 2019-07-16 2574
fa0c5eabbdd330 Janosch Frank 2019-07-16 2575 for (vma = mm->mmap; vma; vma = vma->vm_next) {
7a2653612bb6f1 Christian Borntraeger 2020-03-27 2576 ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
7a2653612bb6f1 Christian Borntraeger 2020-03-27 @2577 MADV_UNMERGEABLE, &vma->vm_flags);
7a2653612bb6f1 Christian Borntraeger 2020-03-27 2578 if (ret)
7a2653612bb6f1 Christian Borntraeger 2020-03-27 2579 return ret;
fa0c5eabbdd330 Janosch Frank 2019-07-16 2580 }
fa0c5eabbdd330 Janosch Frank 2019-07-16 2581 mm->def_flags &= ~VM_MERGEABLE;
fa0c5eabbdd330 Janosch Frank 2019-07-16 2582 return 0;
fa0c5eabbdd330 Janosch Frank 2019-07-16 2583 }
fa0c5eabbdd330 Janosch Frank 2019-07-16 2584 EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
fa0c5eabbdd330 Janosch Frank 2019-07-16 2585
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
Hi Ming,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on linux/master]
[also build test WARNING on arm64/for-next/core powerpc/next asm-generic/master linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master tip/x86/core next-20210601]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Ming-Lin/mm-adds-MAP_NOSIGBUS-extension-for-shmem-read/20210602-072403
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
cd tools/perf && ./check-headers.sh
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
perfheadercheck warnings: (new ones prefixed by >>)
>> Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/mman-common.h' differs from latest version at 'include/uapi/asm-generic/mman-common.h': 32> #define MAP_NOSIGBUS 0x200000 /* do not SIGBUS on out-of-band shmem read */
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
On 6/1/2021 8:49 PM, Hugh Dickins wrote:
>> index 096bba4..69cd856 100644
>> --- a/mm/mmap.c
>> +++ b/mm/mmap.c
>> @@ -1419,6 +1419,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>> if (!len)
>> return -EINVAL;
>>
>> + if ((flags & MAP_NOSIGBUS) && ((prot & PROT_WRITE) || !shmem_file(file)))
>> + return -EINVAL;
>> +
>
> No, for several reasons.
>
> This has nothing to do with shmem really, that's just where this patch
> hacks it in - and where you have a first user in mind. If this goes
> forward, please modify mm/memory.c not mm/shmem.c, to make
> VM_FAULT_SIGBUS on fault to VM_NOSIGBUS vma do the mapping of zero page.
>
> (prot & PROT_WRITE) tells you about the mmap() flags, but says nothing
> about what mprotect() could do later on. Look out for VM_SHARED and
> VM_MAYSHARE and VM_MAYWRITE further down; and beware the else (!file)
> block below them, shared anonymous would need more protection too.
>
> Constructive comment: I guess much of my objection to this feature
> comes from allowing it in the MAP_SHARED case. If you restrict it
> to MAP_PRIVATE mapping of file, then it's less objectionable, and
> you won't have to worry (so much?) about write protection. Copy
> on write is normal there, and it's well established that subsequent
> changes in the file will not be shared; you'd just be extending that
> behaviour from writes to sigbusy reads.
>
> And by restricting to MAP_PRIVATE, you would allow for adding a
> proper MAP_SHARED implementation later, if it's thought useful
> (that being the implementation which can subsequently unmap a
> zero page to let new page cache be mapped).
This is what I wrote so far.
---
include/linux/mm.h | 2 ++
include/linux/mman.h | 1 +
include/uapi/asm-generic/mman-common.h | 1 +
mm/memory.c | 12 ++++++++++++
mm/mmap.c | 4 ++++
5 files changed, 20 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e9d67bc..af9e277 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -373,6 +373,8 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
# define VM_UFFD_MINOR VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+#define VM_NOSIGBUS VM_FLAGS_BIT(38) /* Do not SIGBUS on fault */
+
/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
diff --git a/include/linux/mman.h b/include/linux/mman.h
index b2cbae9..c966b08 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -154,6 +154,7 @@ static inline bool arch_validate_flags(unsigned long flags)
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
+ _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
arch_calc_vm_flag_bits(flags);
}
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f94f65d..a2a5333 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -29,6 +29,7 @@
#define MAP_HUGETLB 0x040000 /* create a huge page mapping */
#define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
+#define MAP_NOSIGBUS 0x200000 /* do not SIGBUS on fault */
#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */
diff --git a/mm/memory.c b/mm/memory.c
index eff2a47..7195dac 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3676,6 +3676,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
}
ret = vma->vm_ops->fault(vmf);
+ if (unlikely(ret & VM_FAULT_SIGBUS) && (vma->vm_flags & VM_NOSIGBUS)) {
+ /*
+ * Get zero page for MAP_NOSIGBUS mapping, which isn't
+ * coherent wrt shmem contents that are expanded and
+ * filled in later.
+ */
+ vma->vm_flags |= VM_MIXEDMAP;
+ if (!vm_insert_page(vma, (unsigned long)vmf->address,
+ ZERO_PAGE(vmf->address)))
+ return VM_FAULT_NOPAGE;
+ }
+
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index 096bba4..74fb49a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1419,6 +1419,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!len)
return -EINVAL;
+ /* Restrict MAP_NOSIGBUS to MAP_PRIVATE mapping */
+ if ((flags & MAP_NOSIGBUS) && !(flags & MAP_PRIVATE))
+ return -EINVAL;
+
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
>
>> /*
>> * Does the application expect PROT_READ to imply PROT_EXEC?
>> *
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index 5d46611..5d15b08 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -1812,7 +1812,22 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>> repeat:
>> if (sgp <= SGP_CACHE &&
>> ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
>> - return -EINVAL;
>> + if (!vma || !(vma->vm_flags & VM_NOSIGBUS))
>> + return -EINVAL;
>> +
>> + vma->vm_flags |= VM_MIXEDMAP;
>
> No. Presumably you hit the BUG_ON(mmap_read_trylock(vma->vm_mm))
> in vm_insert_page(), so decided to modify the vm_flags here: no,
> that BUG is saying you need mmap_write_lock() to write vm_flags.
But the comments above vm_insert_page() told me to set VM_MIXEDMAP on vma
* Usually this function is called from f_op->mmap() handler
* under mm->mmap_lock write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
>
> One other thing while it crosses my mind. You'll need to decide
> what truncating or hole-punching the file does to the zero pages
> in its userspace mappings. I may turn out wrong, but I think you'll
> find that truncation removes them, but hole-punch leaves them, and
> ought to be modified to remove them too (it's a matter of how the
> "even_cows" arg to unmap_mapping_range() is treated).
I did a quick test: after inserting zero pages, it seems that truncation
also leaves the mappings in place.
I'm still reading code to learn this part ...
On Wed, 2 Jun 2021, Ming Lin wrote:
>
> This is what I wrote so far.
>
> ---
> include/linux/mm.h | 2 ++
> include/linux/mman.h | 1 +
> include/uapi/asm-generic/mman-common.h | 1 +
> mm/memory.c | 12 ++++++++++++
> mm/mmap.c | 4 ++++
> 5 files changed, 20 insertions(+)
I have not looked at the rest, just looking at mm/memory.c:
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3676,6 +3676,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
> }
> ret = vma->vm_ops->fault(vmf);
> + if (unlikely(ret & VM_FAULT_SIGBUS) && (vma->vm_flags & VM_NOSIGBUS))
> {
> + /*
> + * Get zero page for MAP_NOSIGBUS mapping, which isn't
> + * coherent wrt shmem contents that are expanded and
> + * filled in later.
> + */
> + vma->vm_flags |= VM_MIXEDMAP;
> + if (!vm_insert_page(vma, (unsigned long)vmf->address,
> + ZERO_PAGE(vmf->address)))
> + return VM_FAULT_NOPAGE;
> + }
> +
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY
> |
> VM_FAULT_DONE_COW)))
> return ret;
Sorry, I directed you to mm/memory.c without indicating what's
appropriate here. Please don't attempt to use VM_MIXEDMAP and
vm_insert_page(): they're for special driver mmaps, they're no
better here than they were in mm/shmem.c.
It's do_anonymous_page()'s business to map in the zero page on
read fault (see "my_zero_pfn(vmf->address)" in there), or fill
a freshly allocated page with zeroes on write fault - and now
you're sticking to MAP_PRIVATE, write faults in VM_WRITE areas
are okay for VM_NOSIGBUS.
Ideally you can simply call do_anonymous_page() from __do_fault()
in the VM_FAULT_SIGBUS on VM_NOSIGBUS case. That's what to start
from anyway: but look to see if there's state to be adjusted to
achieve that; and it won't be surprising if somewhere down in
do_anonymous_page() or something it calls, there's a BUG on it
being called when vma->vm_file is set, or something like that.
May need some tweaking.
Hugh
On Wed, Jun 2, 2021 at 5:46 PM Hugh Dickins <[email protected]> wrote:
>
> Ideally you can simply call do_anonymous_page() from __do_fault()
> in the VM_FAULT_SIGBUS on VM_NOSIGBUS case.
Heh.
We're actually then back to my original patch.
That one doesn't handle shared mappings (even read-only ones), for the
simple reason that do_anonymous_page() refuses to insert anonymous
pages into a shared mapping, and has
/* File mapping without ->vm_ops ? */
if (vma->vm_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
at the very top.
But yes, if we just remove that check, I think my original patch
should actually "JustWork(tm)".
I'm attaching it again, with old name and old commentary (ie that
/* FIXME! We don't have a VM_NOFAULT bit */
should just be replaced with that VM_NOSIGBUS bit instead, and the
#if'ed out region should be enabled).
Oh, and we need to think hard about one more case: mprotect().
In particular, I think the attached patch fails horribly for the case
of a shared mapping that starts out read-only, then inserts a zero
page, then somebody does mprotect(PROT_WRITE), and then writes to the
page. I haven't checked what the write protect fault handler does, but
I think that for a shared mapping it will just make the page dirty and
writable.
Which would be horribly wrong for VM_NOSIGBUS.
So that support infrastructure that adds MAP_NOSIGBUS, and checks that
it is only done on a read-only mapping, also has to make sure that it
clears the VM_MAYWRITE bit when it sets VM_NOSIGBUS.
That way mprotect can't then later make it writable.
Hugh, comments on this approach?
Again: this patch is my *OLD* one, I didn't try to update it to the
new world order. It requires
- Ming's MAP_NOSIGBUS code
- removal of that "File mapping without ->vm_ops" case
- that FIXME fixed and name updated
- and that VM_MAYWRITE clearing if VM_NOSIGBUS is set, to avoid the
mprotect issue.
Hmm?
Linus
On Thu, 3 Jun 2021, Linus Torvalds wrote:
> On Wed, Jun 2, 2021 at 5:46 PM Hugh Dickins <[email protected]> wrote:
> >
> > Ideally you can simply call do_anonymous_page() from __do_fault()
> > in the VM_FAULT_SIGBUS on VM_NOSIGBUS case.
>
> Heh.
>
> We're actually then back to my original patch.
>
> That one doesn't handle shared mappings (even read-only ones), for the
> simple reason that do_anonymous_page() refuses to insert anonymous
> pages into a shared mapping, and has
>
> /* File mapping without ->vm_ops ? */
> if (vma->vm_flags & VM_SHARED)
> return VM_FAULT_SIGBUS;
>
> at the very top.
>
> But yes, if we just remove that check, I think my original patch
> should actually "JustWork(tm)".
But no!
Sorry, I don't have time for this at present, so haven't looked at
your original patch.
But the point that we've arrived at, that I'm actually now fairly
happy with, is do *not* permit MAP_NOSIGBUS on MAP_SHARED mappings.
I didn't check the placement yet, easy to get wrong, but I believe
Ming Lin is now enforcing that over at the mmap() end.
On a MAP_PRIVATE mapping, the nasty opaque blob of zeroes can
claim some precedent in what already happens with COW'ed pages.
Which leaves MAP_NOSIGBUS on MAP_SHARED as currently unsupported,
perhaps never supported on anything, perhaps one day supported on
shmem; but if it's ever supported then that one will naturally be
transparent to future changes in page cache - we call that "shared".
Of course, internally, there's the in-between case of MAP_SHARED
without PROT_WRITE and without writable fd: VM_MAYSHARE without
VM_SHARED or VM_MAYWRITE. We *could* let that one accept
MAP_NOSIGBUS, but who wants to write the manpage for it?
Please stick to MAP_PRIVATE: that's good enough.
>
> I'm attaching it again, with old name and old commentary (ie that
>
> /* FIXME! We don't have a VM_NOFAULT bit */
>
> should just be replaced with that VM_NOSIGBUS bit instead, and the
> #if'ed out region should be enabled).
>
> Oh, and we need to think hard about one more case: mprotect().
>
> In particular, I think the attached patch fails horribly for the case
> of a shared mapping that starts out read-only, then inserts a zero
> page, then somebody does mprotect(PROT_WRITE), and then writes to the
> page. I haven't checked what the write protect fault handler does, but
> I think that for a shared mapping it will just make the page dirty and
> writable.
Obviously the finished patch will need to be scrutinized carefully, but
I think the mprotect() questions vanish when restricted to MAP_PRIVATE.
>
> Which would be horribly wrong for VM_NOSIGBUS.
>
> So that support infrastructure that adds MAP_NOSIGBUS, and checks that
> it is only done on a read-only mapping, also has to make sure that it
> clears the VM_MAYWRITE bit when it sets VM_NOSIGBUS.
>
> That way mprotect can't then later make it writable.
>
> Hugh, comments on this approach?
Comments above, just stick to MAP_PRIVATE.
Hugh
>
> Again: this patch is my *OLD* one, I didn't try to update it to the
> new world order. It requires
>
> - Ming's MAP_NOSIGBUS code
>
> - removal of that "File mapping without ->vm_ops" case
>
> - that FIXME fixed and name updated
>
> - and that VM_MAYWRITE clearing if VM_NOSIGBUS is set, to avoid the
> mprotect issue.
>
> Hmm?
>
> Linus
On Thu, Jun 3, 2021 at 12:07 PM Hugh Dickins <[email protected]> wrote:
>
> But the point that we've arrived at, that I'm actually now fairly
> happy with, is do *not* permit MAP_NOSIGBUS on MAP_SHARED mappings.
Yeah, if that's sufficient, then that original patch should just work as-is.
But there was some reason why people didn't like that patch
originally, and I think it was literally about how it only worked on
private mappings (the "we don't have a flag for it in the vm_flags"
part was just a small detail).
I guess that objection ended up changing over time.
Linus
On Thu, Jun 3, 2021 at 12:12 PM Linus Torvalds
<[email protected]> wrote:
>
> Yeah, if that's sufficient, then that original patch should just work as-is.
To clarify: it obviously needs the VM_xyz flags things, but the
VM_SHARED check in do_anonymous_page() is fine, and the whole issue
with VM_MAYWRITE is entirely moot.
MAP_PRIVATE works fine with zero pages even when writable - they get
COW'ed properly, of course.
Linus
> On Jun 3, 2021, at 12:14 PM, Linus Torvalds <[email protected]> wrote:
>
> On Thu, Jun 3, 2021 at 12:07 PM Hugh Dickins <[email protected]> wrote:
>>
>> But the point that we've arrived at, that I'm actually now fairly
>> happy with, is do *not* permit MAP_NOSIGBUS on MAP_SHARED mappings.
>
> Yeah, if that's sufficient, then that original patch should just work as-is.
>
> But there was some reason why people didn't like that patch
> originally, and I think it was literally about how it only worked on
> private mappings (the "we don't have a flag for it in the vm_flags"
> part was just a small detail).
>
> I guess that objection ended up changing over time.
>
>
I don’t understand the use case well enough to comment on whether MAP_PRIVATE is sufficient, but I’m with Hugh: if this feature is implemented for MAP_SHARED, it should be fully coherent.
On Thursday, June 3rd, 2021 at 9:24 PM, Andy Lutomirski <[email protected]> wrote:
> I don’t understand the use case well enough to comment on whether MAP_PRIVATE
> is sufficient, but I’m with Hugh: if this feature is implemented for
> MAP_SHARED, it should be fully coherent.
I've tried to explain what we'd need from user-space PoV in [1].
tl;dr the MAP_PRIVATE restriction would get us pretty far, even if it
won't allow us to have all of the bells and whistles.
[1]: https://lore.kernel.org/linux-mm/vs1Us2sm4qmfvLOqNat0-r16GyfmWzqUzQ4KHbXJwEcjhzeoQ4sBTxx7QXDG9B6zk5AeT7FsNb3CSr94LaKy6Novh1fbbw8D_BBxYsbPLms=@emersion.fr/T/#mb321a8d39e824740877ba95f1df780ffd52c3862
On 6/2/2021 5:46 PM, Hugh Dickins wrote:
>
> It's do_anonymous_page()'s business to map in the zero page on
> read fault (see "my_zero_pfn(vmf->address)" in there), or fill
> a freshly allocated page with zeroes on write fault - and now
> you're sticking to MAP_PRIVATE, write faults in VM_WRITE areas
> are okay for VM_NOSIGBUS.
>
> Ideally you can simply call do_anonymous_page() from __do_fault()
> in the VM_FAULT_SIGBUS on VM_NOSIGBUS case. That's what to start
> from anyway: but look to see if there's state to be adjusted to
> achieve that; and it won't be surprising if somewhere down in
> do_anonymous_page() or something it calls, there's a BUG on it
> being called when vma->vm_file is set, or something like that.
> May need some tweaking.
do_anonymous_page() works nicely for read fault and write fault.
I didn't see any BUG() thing in my test.
But I'm still struggling with how to do "punch hole should remove the mapping of zero page".
Here is the hack I have now.
diff --git a/mm/memory.c b/mm/memory.c
index 46ecda5..6b5a897 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1241,7 +1241,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct page *page;
page = vm_normal_page(vma, addr, ptent);
- if (unlikely(details) && page) {
+ if (unlikely(details) && page && !(vma->vm_flags & VM_NOSIGBUS)) {
/*
* unmap_shared_mapping_pages() wants to
* invalidate cache without truncating:
And the other parts of the patch are as follows:
----
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e9d67bc..af9e277 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -373,6 +373,8 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
# define VM_UFFD_MINOR VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+#define VM_NOSIGBUS VM_FLAGS_BIT(38) /* Do not SIGBUS on fault */
+
/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
diff --git a/include/linux/mman.h b/include/linux/mman.h
index b2cbae9..c966b08 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -154,6 +154,7 @@ static inline bool arch_validate_flags(unsigned long flags)
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
+ _calc_vm_trans(flags, MAP_NOSIGBUS, VM_NOSIGBUS ) |
arch_calc_vm_flag_bits(flags);
}
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f94f65d..a2a5333 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -29,6 +29,7 @@
#define MAP_HUGETLB 0x040000 /* create a huge page mapping */
#define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
+#define MAP_NOSIGBUS 0x200000 /* do not SIGBUS on fault */
#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */
diff --git a/mm/memory.c b/mm/memory.c
index eff2a47..46ecda5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3676,6 +3676,17 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
}
ret = vma->vm_ops->fault(vmf);
+ if (unlikely(ret & VM_FAULT_SIGBUS) && (vma->vm_flags & VM_NOSIGBUS)) {
+ /*
+ * For MAP_NOSIGBUS mapping, map in the zero page on read fault
+ * or fill a freshly allocated page with zeroes on write fault
+ */
+ ret = do_anonymous_page(vmf);
+ if (!ret)
+ ret = VM_FAULT_NOPAGE;
+ return ret;
+ }
+
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index 096bba4..74fb49a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1419,6 +1419,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!len)
return -EINVAL;
+ /* Restrict MAP_NOSIGBUS to MAP_PRIVATE mapping */
+ if ((flags & MAP_NOSIGBUS) && !(flags & MAP_PRIVATE))
+ return -EINVAL;
+
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*