2009-12-10 13:58:19

by David Howells

[permalink] [raw]
Subject: [PATCH 1/7] security: do not check mmap_min_addr on nommu systems

From: Eric Paris <[email protected]>

nommu systems can do anything with memory they please and so they already
win. mmap_min_addr is the least of their worries. Currently the
mmap_min_addr implementation is problematic on such systems. This patch
changes the addr_only argument to be a flags argument which can take the
values for addr_only or not_addr. LSMs then need to properly implement these two
flags.

Signed-off-by: Eric Paris <[email protected]>
---

include/linux/security.h | 20 ++++++++++++++------
mm/mmap.c | 6 ++++--
mm/mremap.c | 6 ++++--
mm/nommu.c | 3 ++-
security/commoncap.c | 7 ++++---
security/security.c | 5 +++--
security/selinux/hooks.c | 9 +++++----
7 files changed, 36 insertions(+), 20 deletions(-)


diff --git a/include/linux/security.h b/include/linux/security.h
index 466cbad..e7dc87c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -43,6 +43,10 @@
#define SECURITY_CAP_NOAUDIT 0
#define SECURITY_CAP_AUDIT 1

+/* sec_flags for security_file_mmap */
+#define SECURITY_MMAP_ONLY_ADDR_CHECK 0x01
+#define SECURITY_MMAP_SKIP_ADDR_CHECK 0x02
+
struct ctl_table;
struct audit_krule;

@@ -69,7 +73,7 @@ extern int cap_inode_need_killpriv(struct dentry *dentry);
extern int cap_inode_killpriv(struct dentry *dentry);
extern int cap_file_mmap(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags,
- unsigned long addr, unsigned long addr_only);
+ unsigned long addr, unsigned long sec_flags);
extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5);
@@ -604,11 +608,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
* Return 0 if permission is granted.
* @file_mmap :
* Check permissions for a mmap operation. The @file may be NULL, e.g.
- * if mapping anonymous memory.
+ * if mapping anonymous memory. This actually performs 2 separate types
+ * of checks. It first checks permissions on the file in question (if
+ * it exists) and it also checks if the address is allowed.
* @file contains the file structure for file to map (may be NULL).
* @reqprot contains the protection requested by the application.
* @prot contains the protection that will be applied by the kernel.
* @flags contains the operational flags.
+ * @addr address vm will map to
+ * @sec_flags which of the 2 types of checks should (not) be performed
* Return 0 if permission is granted.
* @file_mprotect:
* Check permissions before changing memory access permissions.
@@ -1556,7 +1564,7 @@ struct security_operations {
int (*file_mmap) (struct file *file,
unsigned long reqprot, unsigned long prot,
unsigned long flags, unsigned long addr,
- unsigned long addr_only);
+ unsigned long sec_flags);
int (*file_mprotect) (struct vm_area_struct *vma,
unsigned long reqprot,
unsigned long prot);
@@ -1825,7 +1833,7 @@ void security_file_free(struct file *file);
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int security_file_mmap(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags,
- unsigned long addr, unsigned long addr_only);
+ unsigned long addr, unsigned long sec_flags);
int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
unsigned long prot);
int security_file_lock(struct file *file, unsigned int cmd);
@@ -2321,9 +2329,9 @@ static inline int security_file_mmap(struct file *file, unsigned long reqprot,
unsigned long prot,
unsigned long flags,
unsigned long addr,
- unsigned long addr_only)
+ unsigned long sec_flags)
{
- return cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
+ return cap_file_mmap(file, reqprot, prot, flags, addr, sec_flags);
}

static inline int security_file_mprotect(struct vm_area_struct *vma,
diff --git a/mm/mmap.c b/mm/mmap.c
index 292ddc3..6b933e9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1664,7 +1664,8 @@ static int expand_downwards(struct vm_area_struct *vma,
return -ENOMEM;

address &= PAGE_MASK;
- error = security_file_mmap(NULL, 0, 0, 0, address, 1);
+ error = security_file_mmap(NULL, 0, 0, 0, address,
+ SECURITY_MMAP_ONLY_ADDR_CHECK);
if (error)
return error;

@@ -2005,7 +2006,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if (is_hugepage_only_range(mm, addr, len))
return -EINVAL;

- error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
+ error = security_file_mmap(NULL, 0, 0, 0, addr,
+ SECURITY_MMAP_ONLY_ADDR_CHECK);
if (error)
return error;

diff --git a/mm/mremap.c b/mm/mremap.c
index 97bff25..6731bcb 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -313,7 +313,8 @@ unsigned long do_mremap(unsigned long addr,
if ((addr <= new_addr) && (addr+old_len) > new_addr)
goto out;

- ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+ ret = security_file_mmap(NULL, 0, 0, 0, new_addr,
+ SECURITY_MMAP_ONLY_ADDR_CHECK);
if (ret)
goto out;

@@ -421,7 +422,8 @@ unsigned long do_mremap(unsigned long addr,
goto out;
}

- ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+ ret = security_file_mmap(NULL, 0, 0, 0, new_addr,
+ SECURITY_MMAP_ONLY_ADDR_CHECK);
if (ret)
goto out;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 9876fa0..0c0364f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -974,7 +974,8 @@ static int validate_mmap_request(struct file *file,
}

/* allow the security API to have its say */
- ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
+ ret = security_file_mmap(file, reqprot, prot, flags, 0,
+ SECURITY_MMAP_SKIP_ADDR_CHECK);
if (ret < 0)
return ret;

diff --git a/security/commoncap.c b/security/commoncap.c
index f800fdb..193e9fa 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -924,7 +924,7 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
* @prot: unused
* @flags: unused
* @addr: address attempting to be mapped
- * @addr_only: unused
+ * @sec_flags: should the addr be checked?
*
* If the process is attempting to map memory below mmap_min_addr they need
* CAP_SYS_RAWIO. The other parameters to this function are unused by the
@@ -933,11 +933,12 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
*/
int cap_file_mmap(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags,
- unsigned long addr, unsigned long addr_only)
+ unsigned long addr, unsigned long sec_flags)
{
int ret = 0;

- if (addr < dac_mmap_min_addr) {
+ if (!(sec_flags & SECURITY_MMAP_SKIP_ADDR_CHECK) &&
+ (addr < dac_mmap_min_addr)) {
ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO,
SECURITY_CAP_AUDIT);
/* set PF_SUPERPRIV if it turns out we allow the low mmap */
diff --git a/security/security.c b/security/security.c
index 24e060b..4746ce9 100644
--- a/security/security.c
+++ b/security/security.c
@@ -677,11 +677,12 @@ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)

int security_file_mmap(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags,
- unsigned long addr, unsigned long addr_only)
+ unsigned long addr, unsigned long sec_flags)
{
int ret;

- ret = security_ops->file_mmap(file, reqprot, prot, flags, addr, addr_only);
+ ret = security_ops->file_mmap(file, reqprot, prot, flags, addr,
+ sec_flags);
if (ret)
return ret;
return ima_file_mmap(file, prot);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7a374c2..c099ecb 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3042,7 +3042,7 @@ error:

static int selinux_file_mmap(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags,
- unsigned long addr, unsigned long addr_only)
+ unsigned long addr, unsigned long sec_flags)
{
int rc = 0;
u32 sid = current_sid();
@@ -3053,7 +3053,8 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot,
* at bad behaviour/exploit that we always want to get the AVC, even
* if DAC would have also denied the operation.
*/
- if (addr < CONFIG_LSM_MMAP_MIN_ADDR) {
+ if (!(sec_flags & SECURITY_MMAP_SKIP_ADDR_CHECK) &&
+ (addr < CONFIG_LSM_MMAP_MIN_ADDR)) {
rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT,
MEMPROTECT__MMAP_ZERO, NULL);
if (rc)
@@ -3061,8 +3062,8 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot,
}

/* do DAC check on address space usage */
- rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
- if (rc || addr_only)
+ rc = cap_file_mmap(file, reqprot, prot, flags, addr, sec_flags);
+ if (rc || (sec_flags & SECURITY_MMAP_ONLY_ADDR_CHECK))
return rc;

if (selinux_checkreqprot)


2009-12-10 13:58:28

by David Howells

[permalink] [raw]
Subject: [PATCH 2/7] NOMMU: Provide per-task stack usage through /proc for NOMMU

Make it possible to get the per-task stack usage through /proc on a NOMMU
system. The MMU-mode routine can't be used because walk_page_range() doesn't
work on NOMMU.

It can be tested to show the stack usages of non-kernel-thread processes:

# grep "Stack usage:" /proc/*/status | grep -v "0 kB"
/proc/1/status:Stack usage: 2 kB
/proc/57/status:Stack usage: 3 kB
/proc/58/status:Stack usage: 1 kB
/proc/59/status:Stack usage: 3 kB
/proc/60/status:Stack usage: 5 kB
/proc/self/status:Stack usage: 1 kB

I've only tested it with ELF-FDPIC, though it should work with FLAT too.

Signed-off-by: David Howells <[email protected]>
Cc: Stefani Seibold <[email protected]>
Cc: Mike Frysinger <[email protected]>
Cc: Paul Mundt <[email protected]>
Cc: Robin Getz <[email protected]>
---

fs/proc/array.c | 20 +++++++++++++++++---
1 files changed, 17 insertions(+), 3 deletions(-)


diff --git a/fs/proc/array.c b/fs/proc/array.c
index 4badde1..c67251e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -387,8 +387,7 @@ static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma,
return ss.usage;
}

-static inline void task_show_stack_usage(struct seq_file *m,
- struct task_struct *task)
+static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
{
struct vm_area_struct *vma;
struct mm_struct *mm = get_task_mm(task);
@@ -404,9 +403,24 @@ static inline void task_show_stack_usage(struct seq_file *m,
mmput(mm);
}
}
-#else
+#else /* CONFIG_MMU */
+/*
+ * Calculate the size of a NOMMU process's stack
+ */
static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
{
+ unsigned long sp, base, usage;
+
+ base = task->stack_start;
+ sp = KSTK_ESP(task);
+
+#ifdef CONFIG_STACK_GROWSUP
+ usage = sp - base;
+#else
+ usage = base - sp;
+#endif
+
+ seq_printf(m, "Stack usage:\t%lu kB\n", (usage + 1023) >> 10);
}
#endif /* CONFIG_MMU */

2009-12-10 13:59:23

by David Howells

[permalink] [raw]
Subject: [PATCH 3/7] NOMMU: fix malloc performance by adding uninitialized flag

From: Jie Zhang <[email protected]>

The NOMMU code currently clears all anonymous mmapped memory. While this
is what we want in the default case, all memory allocation from userspace
under NOMMU has to go through this interface, including malloc() which is
allowed to return uninitialized memory. This can easily be a significant
performance penalty. So for constrained embedded systems were security is
irrelevant, allow people to avoid clearing memory unnecessarily.

This also alters the ELF-FDPIC binfmt such that it obtains uninitialised
memory for the brk and stack region.

Signed-off-by: Jie Zhang <[email protected]>
Signed-off-by: Robin Getz <[email protected]>
Signed-off-by: Mike Frysinger <[email protected]>
Signed-off-by: David Howells <[email protected]>
---

Documentation/nommu-mmap.txt | 26 ++++++++++++++++++++++++++
fs/binfmt_elf_fdpic.c | 3 ++-
include/asm-generic/mman-common.h | 5 +++++
init/Kconfig | 22 ++++++++++++++++++++++
mm/nommu.c | 8 +++++---
5 files changed, 60 insertions(+), 4 deletions(-)


diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt
index b565e82..8e1ddec 100644
--- a/Documentation/nommu-mmap.txt
+++ b/Documentation/nommu-mmap.txt
@@ -119,6 +119,32 @@ FURTHER NOTES ON NO-MMU MMAP
granule but will only discard the excess if appropriately configured as
this has an effect on fragmentation.

+ (*) The memory allocated by a request for an anonymous mapping will normally
+ be cleared by the kernel before being returned in accordance with the
+ Linux man pages (ver 2.22 or later).
+
+ In the MMU case this can be achieved with reasonable performance as
+ regions are backed by virtual pages, with the contents only being mapped
+ to cleared physical pages when a write happens on that specific page
+ (prior to which, the pages are effectively mapped to the global zero page
+ from which reads can take place). This spreads out the time it takes to
+ initialize the contents of a page - depending on the write-usage of the
+ mapping.
+
+ In the no-MMU case, however, anonymous mappings are backed by physical
+ pages, and the entire map is cleared at allocation time. This can cause
+ significant delays during a userspace malloc() as the C library does an
+ anonymous mapping and the kernel then does a memset for the entire map.
+
+ However, for memory that isn't required to be precleared - such as that
+ returned by malloc() - mmap() can take a MAP_UNINITIALIZED flag to
+ indicate to the kernel that it shouldn't bother clearing the memory before
+ returning it. Note that CONFIG_MMAP_ALLOW_UNINITIALIZED must be enabled
+ to permit this, otherwise the flag will be ignored.
+
+ uClibc uses this to speed up malloc(), and the ELF-FDPIC binfmt uses this
+ to allocate the brk and stack region.
+
(*) A list of all the private copy and anonymous mappings on the system is
visible through /proc/maps in no-MMU mode.

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 38502c6..79d2b1a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -380,7 +380,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
down_write(&current->mm->mmap_sem);
current->mm->start_brk = do_mmap(NULL, 0, stack_size,
PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN,
+ MAP_PRIVATE | MAP_ANONYMOUS |
+ MAP_UNINITIALIZED | MAP_GROWSDOWN,
0);

if (IS_ERR_VALUE(current->mm->start_brk)) {
diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index 5ee13b2..2011126 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -19,6 +19,11 @@
#define MAP_TYPE 0x0f /* Mask for type of mapping */
#define MAP_FIXED 0x10 /* Interpret addr exactly */
#define MAP_ANONYMOUS 0x20 /* don't use a file */
+#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
+# define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be uninitialized */
+#else
+# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */
+#endif

#define MS_ASYNC 1 /* sync memory asynchronously */
#define MS_INVALIDATE 2 /* invalidate the caches */
diff --git a/init/Kconfig b/init/Kconfig
index 54c655c..72e6d10 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1079,6 +1079,28 @@ config SLOB

endchoice

+config MMAP_ALLOW_UNINITIALIZED
+ bool "Allow mmaped anonymous memory to be un-initialized"
+ depends on EMBEDDED && !MMU
+ default n
+ help
+ Normally, and according to the Linux spec, anonymous memory obtained
+ from mmap() has its contents cleared before it is passed to
+ userspace. Enabling this config option allows you to request that
+ mmap() skip that if it is given a MAP_UNINITIALIZED flag, thus
+ providing a huge performance boost. If this option is not enabled,
+ then the flag will be ignored.
+
+ This is taken advantage of by uClibc's malloc(), and also by
+ ELF-FDPIC binfmt's brk and stack allocator.
+
+ Because of the obvious security issues, this option should only be
+ enabled on embedded devices where you control what is run in
+ userspace. Since that isn't generally a problem on no-MMU systems,
+ it is normally safe to say Y here.
+
+ See Documentation/nommu-mmap.txt for more information.
+
config PROFILING
bool "Profiling support (EXPERIMENTAL)"
help
diff --git a/mm/nommu.c b/mm/nommu.c
index 0c0364f..1544a65 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1144,9 +1144,6 @@ static int do_mmap_private(struct vm_area_struct *vma,
if (ret < rlen)
memset(base + ret, 0, rlen - ret);

- } else {
- /* if it's an anonymous mapping, then just clear it */
- memset(base, 0, rlen);
}

return 0;
@@ -1344,6 +1341,11 @@ unsigned long do_mmap_pgoff(struct file *file,
goto error_just_free;
add_nommu_region(region);

+ /* clear anonymous mappings that don't ask for uninitialized data */
+ if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
+ memset((void *)region->vm_start, 0,
+ region->vm_end - region->vm_start);
+
/* okay... we have a mapping; now we have to register it */
result = vma->vm_start;

2009-12-10 13:58:43

by David Howells

[permalink] [raw]
Subject: [PATCH 4/7] FDPIC: Respect PT_GNU_STACK exec protection markings when creating NOMMU stack

From: Mike Frysinger <[email protected]>

The current code will load the stack size and protection markings, but then
only use the markings in the MMU code path. The NOMMU code path always passes
PROT_EXEC to the mmap() call. While this doesn't matter to most people whilst
the code is running, it will cause a pointless icache flush when starting every
FDPIC application. Typically this icache flush will be of a region on the
order of 128KB in size, or may be the entire icache, depending on the
facilities available on the CPU.

In the case where the arch default behaviour seems to be desired
(EXSTACK_DEFAULT), we probe VM_STACK_FLAGS for VM_EXEC to determine whether we
should be setting PROT_EXEC or not.

For arches that support an MPU (Memory Protection Unit - an MMU without the
virtual mapping capability), setting PROT_EXEC or not will make an important
difference.

It should be noted that this change also affects the executability of the brk
region, since ELF-FDPIC has that share with the stack. However, this is
probably irrelevant as NOMMU programs aren't likely to use the brk region,
preferring instead allocation via mmap().

Signed-off-by: Mike Frysinger <[email protected]>
Signed-off-by: David Howells <[email protected]>
---

arch/blackfin/include/asm/page.h | 5 +++++
arch/frv/include/asm/page.h | 2 --
fs/binfmt_elf_fdpic.c | 13 +++++++++++--
3 files changed, 16 insertions(+), 4 deletions(-)


diff --git a/arch/blackfin/include/asm/page.h b/arch/blackfin/include/asm/page.h
index 944a07c..1d04e40 100644
--- a/arch/blackfin/include/asm/page.h
+++ b/arch/blackfin/include/asm/page.h
@@ -10,4 +10,9 @@
#include <asm-generic/page.h>
#define MAP_NR(addr) (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)

+#define VM_DATA_DEFAULT_FLAGS \
+ (VM_READ | VM_WRITE | \
+ ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
#endif
diff --git a/arch/frv/include/asm/page.h b/arch/frv/include/asm/page.h
index 25c6a50..8c97068 100644
--- a/arch/frv/include/asm/page.h
+++ b/arch/frv/include/asm/page.h
@@ -63,12 +63,10 @@ extern unsigned long max_pfn;
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)


-#ifdef CONFIG_MMU
#define VM_DATA_DEFAULT_FLAGS \
(VM_READ | VM_WRITE | \
((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-#endif

#endif /* __ASSEMBLY__ */

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 79d2b1a..db85b42 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -171,6 +171,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
#ifdef ELF_FDPIC_PLAT_INIT
unsigned long dynaddr;
#endif
+#ifndef CONFIG_MMU
+ unsigned long stack_prot;
+#endif
struct file *interpreter = NULL; /* to shut gcc up */
char *interpreter_name = NULL;
int executable_stack;
@@ -316,6 +319,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
* defunct, deceased, etc. after this point we have to exit via
* error_kill */
set_personality(PER_LINUX_FDPIC);
+ if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
+ current->personality |= READ_IMPLIES_EXEC;
set_binfmt(&elf_fdpic_format);

current->mm->start_code = 0;
@@ -377,9 +382,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
if (stack_size < PAGE_SIZE * 2)
stack_size = PAGE_SIZE * 2;

+ stack_prot = PROT_READ | PROT_WRITE;
+ if (executable_stack == EXSTACK_ENABLE_X ||
+ (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
+ stack_prot |= PROT_EXEC;
+
down_write(&current->mm->mmap_sem);
- current->mm->start_brk = do_mmap(NULL, 0, stack_size,
- PROT_READ | PROT_WRITE | PROT_EXEC,
+ current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
MAP_PRIVATE | MAP_ANONYMOUS |
MAP_UNINITIALIZED | MAP_GROWSDOWN,
0);

2009-12-10 13:58:48

by David Howells

[permalink] [raw]
Subject: [PATCH 5/7] NOMMU: Avoiding duplicate icache flushes of shared maps

From: Mike Frysinger <[email protected]>

When working with FDPIC, there are many shared mappings of read-only code
regions between applications (the C library, applet packages like busybox,
etc.), but the current do_mmap_pgoff() function will issue an icache flush
whenever a VMA is added to an MM instead of only doing it when the map is
initially created.

The flush can instead be done when a region is first mmapped PROT_EXEC. Note
that we may not rely on the first mapping of a region being executable - it's
possible for it to be PROT_READ only, so we have to remember whether we've
flushed the region or not, and then flush the entire region when a bit of it is
made executable.

However, this also affects the brk area. That will no longer be executable.
We can mprotect() it to PROT_EXEC on MPU-mode kernels, but for NOMMU mode
kernels, when it increases the brk allocation, making sys_brk() flush the extra
from the icache should suffice. The brk area probably isn't used by NOMMU
programs since the brk area can only use up the leavings from the stack
allocation, where the stack allocation is larger than requested.

Signed-off-by: David Howells <[email protected]>
---

include/linux/mm_types.h | 2 ++
mm/nommu.c | 11 ++++++++---
2 files changed, 10 insertions(+), 3 deletions(-)


diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 84a524a..84d020b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -123,6 +123,8 @@ struct vm_region {
struct file *vm_file; /* the backing file or NULL */

atomic_t vm_usage; /* region usage count */
+ bool vm_icache_flushed : 1; /* true if the icache has been flushed for
+ * this region */
};

/*
diff --git a/mm/nommu.c b/mm/nommu.c
index 1544a65..bffbdfc 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Ok, looks good - let it rip.
*/
+ flush_icache_range(mm->brk, brk);
return mm->brk = brk;
}

@@ -1354,10 +1355,14 @@ unsigned long do_mmap_pgoff(struct file *file,
share:
add_vma_to_mm(current->mm, vma);

- up_write(&nommu_region_sem);
+ /* we flush the region from the icache only when the first executable
+ * mapping of it is made */
+ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
+ flush_icache_range(region->vm_start, region->vm_end);
+ region->vm_icache_flushed = true;
+ }

- if (prot & PROT_EXEC)
- flush_icache_range(result, result + len);
+ up_write(&nommu_region_sem);

kleave(" = %lx", result);
return result;

2009-12-10 13:58:57

by David Howells

[permalink] [raw]
Subject: [PATCH 6/7] NOMMU: Use copy_*_user_page() in access_process_vm()

From: Jie Zhang <[email protected]>

The MMU code uses the copy_*_user_page() variants in access_process_vm()
rather than copy_*_user() as the former includes an icache flush. This is
important when doing things like setting software breakpoints with gdb.
So switch the NOMMU code over to do the same.

This patch makes the reasonable assumption that copy_from_user_page() won't
fail - which is probably fine, as we've checked the VMA from which we're
copying is usable, and the copy is not allowed to cross VMAs. The one case
where it might go wrong is if the VMA is a device rather than RAM, and that
device returns an error which - in which case rubbish will be returned rather
than EIO.

Signed-off-by: Jie Zhang <[email protected]>
Signed-off-by: Mike Frysinger <[email protected]>
Signed-off-by: David Howells <[email protected]>
---

mm/nommu.c | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)


diff --git a/mm/nommu.c b/mm/nommu.c
index bffbdfc..a8c2392 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1897,9 +1897,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in

/* only read or write mappings where it is permitted */
if (write && vma->vm_flags & VM_MAYWRITE)
- len -= copy_to_user((void *) addr, buf, len);
+ copy_to_user_page(vma, NULL, addr,
+ (void *) addr, buf, len);
else if (!write && vma->vm_flags & VM_MAYREAD)
- len -= copy_from_user(buf, (void *) addr, len);
+ copy_from_user_page(vma, NULL, addr,
+ buf, (void *) addr, len);
else
len = 0;
} else {

2009-12-10 13:59:05

by David Howells

[permalink] [raw]
Subject: [PATCH 7/7] NOMMU: ramfs: Drop unused local var

From: Mike Frysinger <[email protected]>

Signed-off-by: Mike Frysinger <[email protected]>
Signed-off-by: David Howells <[email protected]>
---

fs/ramfs/file-nommu.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)


diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 32fae40..2efc571 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -60,7 +60,7 @@ const struct inode_operations ramfs_file_inode_operations = {
*/
int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
{
- unsigned long npages, xpages, loop, limit;
+ unsigned long npages, xpages, loop;
struct page *pages;
unsigned order;
void *data;

2009-12-14 23:33:27

by Mike Frysinger

[permalink] [raw]
Subject: Re: [PATCH 2/7] NOMMU: Provide per-task stack usage through /proc for NOMMU

On Thu, Dec 10, 2009 at 08:58, David Howells wrote:
> Make it possible to get the per-task stack usage through /proc on a NOMMU
> system.  The MMU-mode routine can't be used because walk_page_range() doesn't
> work on NOMMU.
>
> It can be tested to show the stack usages of non-kernel-thread processes:
>
>        # grep "Stack usage:" /proc/*/status | grep -v "0 kB"
>        /proc/1/status:Stack usage:     2 kB
>        /proc/57/status:Stack usage:    3 kB
>        /proc/58/status:Stack usage:    1 kB
>        /proc/59/status:Stack usage:    3 kB
>        /proc/60/status:Stack usage:    5 kB
>        /proc/self/status:Stack usage:  1 kB
>
> I've only tested it with ELF-FDPIC, though it should work with FLAT too.

both seem to work for me. a simple FLAT shows 1kB while one with a
2kB buffer shows 3kB.

Signed-off-by: Mike Frysinger <[email protected]>
-mike

2009-12-14 23:52:19

by Mike Frysinger

[permalink] [raw]
Subject: Re: [PATCH 6/7] NOMMU: Use copy_*_user_page() in access_process_vm()

On Thu, Dec 10, 2009 at 08:58, David Howells wrote:
> From: Jie Zhang <[email protected]>
>
> The MMU code uses the copy_*_user_page() variants in access_process_vm()
> rather than copy_*_user() as the former includes an icache flush.  This is
> important when doing things like setting software breakpoints with gdb.
> So switch the NOMMU code over to do the same.
>
> This patch makes the reasonable assumption that copy_from_user_page() won't
> fail - which is probably fine, as we've checked the VMA from which we're
> copying is usable, and the copy is not allowed to cross VMAs.  The one case
> where it might go wrong is if the VMA is a device rather than RAM, and that
> device returns an error which - in which case rubbish will be returned rather
> than EIO.
>
> Signed-off-by: Jie Zhang <[email protected]>
> Signed-off-by: Mike Frysinger <[email protected]>
> Signed-off-by: David Howells <[email protected]>

btw, there are also:
Acked-by: David McCullough <[email protected]>
Acked-by: Paul Mundt <[email protected]>
Acked-by: Greg Ungerer <[email protected]>
-mike

2009-12-14 23:56:12

by Mike Frysinger

[permalink] [raw]
Subject: Re: [PATCH 5/7] NOMMU: Avoiding duplicate icache flushes of shared maps

On Thu, Dec 10, 2009 at 08:58, David Howells wrote:
> From: Mike Frysinger <[email protected]>
>
> When working with FDPIC, there are many shared mappings of read-only code
> regions between applications (the C library, applet packages like busybox,
> etc.), but the current do_mmap_pgoff() function will issue an icache flush
> whenever a VMA is added to an MM instead of only doing it when the map is
> initially created.
>
> The flush can instead be done when a region is first mmapped PROT_EXEC.  Note
> that we may not rely on the first mapping of a region being executable - it's
> possible for it to be PROT_READ only, so we have to remember whether we've
> flushed the region or not, and then flush the entire region when a bit of it is
> made executable.
>
> However, this also affects the brk area.  That will no longer be executable.
> We can mprotect() it to PROT_EXEC on MPU-mode kernels, but for NOMMU mode
> kernels, when it increases the brk allocation, making sys_brk() flush the extra
> from the icache should suffice.  The brk area probably isn't used by NOMMU
> programs since the brk area can only use up the leavings from the stack
> allocation, where the stack allocation is larger than requested.

this works fine for me, thanks.
Signed-off-by: Mike Frysinger <[email protected]>
-mike

2009-12-15 00:41:57

by Jamie Lokier

[permalink] [raw]
Subject: Re: [uClinux-dev] [PATCH 5/7] NOMMU: Avoiding duplicate icache flushes of shared maps

David Howells wrote:
> From: Mike Frysinger <[email protected]>
>
> When working with FDPIC, there are many shared mappings of read-only code
> regions between applications (the C library, applet packages like busybox,
> etc.), but the current do_mmap_pgoff() function will issue an icache flush
> whenever a VMA is added to an MM instead of only doing it when the map is
> initially created.
>
> @@ -1354,10 +1355,14 @@ unsigned long do_mmap_pgoff(struct file *file,
> share:
> add_vma_to_mm(current->mm, vma);
>
> - up_write(&nommu_region_sem);
> + /* we flush the region from the icache only when the first executable
> + * mapping of it is made */
> + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
> + flush_icache_range(region->vm_start, region->vm_end);
> + region->vm_icache_flushed = true;
> + }
>
> - if (prot & PROT_EXEC)
> - flush_icache_range(result, result + len);
> + up_write(&nommu_region_sem);
>
> kleave(" = %lx", result);
> return result;

This looks like it won't work in the following sequence:

process A maps MAP_SHARED, PROT_READ|PROT_EXEC (flushes icache)
process B maps MAP_SHARED, PROT_READ|PROT_WRITE
and proceeds to modify the data
process C maps MAP_SHARED, PROT_READ|PROT_EXEC (no icache flush)

On a possibly related note:

What about icache flushes in these cases:

When using mprotect() PROT_READ|PROT_WRITE -> PROT_READ|PROT_EXEC,
e.g. as an FDPIC implementation may do when updating PLT entries.

And when calling msync(), like this:

process A maps MAP_SHARED, PROT_READ|PROT_EXEC (flushes icache)
process B maps MAP_SHARED, PROT_READ|PROT_WRITE
and proceeds to modify the data
process A calls msync()
and proceeds to execute the modified contents

Do you think the mprotect() and msync() calls should flush icache in
those cases?

I've seen arguments for it, and arguments that the executing process can
be expected to explicitly flush icache itself in those cases because
it knows what it is doing. (Personally I lean towards the kernel
should be doing it. IRIX interestingly offers both alternatives, with
a PROT_EXEC_NOFLUSH).

But in the first example above, I don't see how process C could be
expected to know it must flush icache, and process B could just be an
"optimised with writable mmap" file copy, so it shouldn't have
responsibility for icache either.

Or is icache fully flushed on every context switch on all nommu
architectures anyway, and defined to do so?

-- Jamie

2009-12-15 05:04:30

by Mike Frysinger

[permalink] [raw]
Subject: Re: [uClinux-dev] [PATCH 5/7] NOMMU: Avoiding duplicate icache flushes of shared maps

On Monday 14 December 2009 19:41:43 Jamie Lokier wrote:
> David Howells wrote:
> > From: Mike Frysinger <[email protected]>
> >
> > When working with FDPIC, there are many shared mappings of read-only code
> > regions between applications (the C library, applet packages like
> > busybox, etc.), but the current do_mmap_pgoff() function will issue an
> > icache flush whenever a VMA is added to an MM instead of only doing it
> > when the map is initially created.
> >
> > @@ -1354,10 +1355,14 @@ unsigned long do_mmap_pgoff(struct file *file,
> > share:
> > add_vma_to_mm(current->mm, vma);
> >
> > - up_write(&nommu_region_sem);
> > + /* we flush the region from the icache only when the first executable
> > + * mapping of it is made */
> > + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
> > + flush_icache_range(region->vm_start, region->vm_end);
> > + region->vm_icache_flushed = true;
> > + }
> >
> > - if (prot & PROT_EXEC)
> > - flush_icache_range(result, result + len);
> > + up_write(&nommu_region_sem);
> >
> > kleave(" = %lx", result);
> > return result;
>
> This looks like it won't work in the following sequence:
>
> process A maps MAP_SHARED, PROT_READ|PROT_EXEC (flushes icache)
> process B maps MAP_SHARED, PROT_READ|PROT_WRITE
> and proceeds to modify the data
> process C maps MAP_SHARED, PROT_READ|PROT_EXEC (no icache flush)
>
> On a possibly related note:
>
> What about icache flushes in these cases:

David will have to respond here, but a test on my side shows that a mmap()
request on an existing r-xs mapping does not grant write access. you get back
a r-xs mapping.

> When using mprotect() PROT_READ|PROT_WRITE -> PROT_READ|PROT_EXEC,
> e.g. as an FDPIC implementation may do when updating PLT entries.

when would that happen ? PLT entries arent updated inline in the .text
section (if they were, you'd have TEXTRELs and the .text wouldnt be shared).
by definition, the function descriptor is stored in the GOT and that is what
gets updated by the resolver during lazy relocation.

> And when calling msync(), like this:
>
> process A maps MAP_SHARED, PROT_READ|PROT_EXEC (flushes icache)
> process B maps MAP_SHARED, PROT_READ|PROT_WRITE
> and proceeds to modify the data
> process A calls msync()
> and proceeds to execute the modified contents
>
> Do you think the mprotect() and msync() calls should flush icache in
> those cases?

From what I can tell, sys_mprotect() and sys_msync() do not currently exist in
the nommu kernel port, and no one has complained so far ;).

uClibc has simply stubbed out these functions into inlines that always return
success. msync() is defined as pushing changes written to a file-backed
mapping back to disk, and I don't think this gets used under nommu.

> I've seen arguments for it, and arguments that the executing process can
> be expected to explicitly flush icache itself in those cases because
> it knows what it is doing. (Personally I lean towards the kernel
> should be doing it. IRIX interestingly offers both alternatives, with
> a PROT_EXEC_NOFLUSH).
>
> But in the first example above, I don't see how process C could be
> expected to know it must flush icache, and process B could just be an
> "optimised with writable mmap" file copy, so it shouldn't have
> responsibility for icache either.

Isn't this what the MS_INVALIDATE flag to msync() is for?

> Or is icache fully flushed on every context switch on all nommu
> architectures anyway, and defined to do so?

Ugh, no, this definitely does not occur, nor should it. The overhead here would be
crazy. On a normal FDPIC boot, the only icache flushes called are for the initial
creation of the shared .text maps in the C library and the applications. And
with a large busybox, you rarely see another one since the .text is shared.
-mike


Attachments:
signature.asc (836.00 B)
This is a digitally signed message part.

2009-12-15 10:53:14

by David Howells

[permalink] [raw]
Subject: Re: [uClinux-dev] [PATCH 5/7] NOMMU: Avoiding duplicate icache flushes of shared maps

Jamie Lokier <[email protected]> wrote:

> This looks like it won't work in the following sequence:
>
> process A maps MAP_SHARED, PROT_READ|PROT_EXEC (flushes icache)
> process B maps MAP_SHARED, PROT_READ|PROT_WRITE
> and proceeds to modify the data
> process C maps MAP_SHARED, PROT_READ|PROT_EXEC (no icache flush)

Assuming all the above refer to the same piece of RAM, there's no reason that
process A will continue to function correctly executing from the first
mapping if process B writes to that RAM through the second mapping.

There's also no point doing an icache flush unless you first flush the dcache
back to the RAM - and we don't know to do that because the O/S does not know
whether the RAM has been changed. So we'd have to do an unconditional dcache
flush too for the entire RAM segment.

I'd prefer to leave this to the writers. If they're mad enough to write
shared code that undergoes runtime modification, and then want to run it on
NOMMU...

So my question back to you is: would it work anyway?

Note that some arches have a specific cache flushing system call. Perhaps
this should be extended to all.

> What about icache flushes in these cases:
>
> When using mprotect() PROT_READ|PROT_WRITE -> PROT_READ|PROT_EXEC,
> e.g. as an FDPIC implementation may do when updating PLT entries.

There is no mprotect() on NOMMU, at least not at the moment. It may be
reasonable to add support for someone turning on/off the PROT_EXEC and
PROT_WRITE bits, and make it flush dcache to RAM when WRITE is turned off, and
flush the icache when EXEC is turned on, in that order.

However, as Mike said, we don't do this in FDPIC. The code sections are
immutable blobs, and are mapped MAP_PRIVATE, PROT_READ|PROT_EXEC from the
start. That way, mmap() will share them for us and even do XIP without special
support in userspace. FDPIC uses a non-executable GOT in the data area, and
loads the function pointer and new GOT pointer out of it before making a call.

> And when calling msync(), like this:
>
> process A maps MAP_SHARED, PROT_READ|PROT_EXEC (flushes icache)
> process B maps MAP_SHARED, PROT_READ|PROT_WRITE
> and proceeds to modify the data
> process A calls msync()
> and proceeds to execute the modified contents

Similarly, we don't provide msync(). On NOMMU, memory mappings cannot be
shared from disks that aren't based direct-access (quasi-)memory (e.g. ramfs,
MTD).

We could, perhaps, partially implement msync() to flush the appropriate caches.
We might even be able to add extra flags to msync() so that it can flush just
the CPU caches - that would obviate the need for separate syscalls for this
purpose.

> Do you think the mprotect() and msync() calls should flush icache in
> those cases?

I don't see that msync() should flush the icache at all. Its purpose is to
flush data to the backing store.

Also, don't forget that under NOMMU conditions, you have no idea if the data
has been modified.

> But in the first example above, I don't see how process C could be
> expected to know it must flush icache, and process B could just be an
> "optimised with writable mmap" file copy, so it shouldn't have
> responsibility for icache either.

It's manually executing off of a MAP_SHARED region, a region that others have
open for write. It has to look after its own semantics. This applies to
process A too.

> I've seen arguments for it, and arguments that the executing process can
> be expected to explicitly flush icache itself in those cases because
> it knows what it is doing. (Personally I lean towards the kernel
> should be doing it. IRIX interestingly offers both alternatives, with
> a PROT_EXEC_NOFLUSH).

I disagree, at least in the case of MAP_SHARED regions. You need to manage
your own coherency. Again, see process A vs process B.

> Or is icache fully flushed on every context switch on all nommu
> architectures anyway, and defined to do so?

That would be a sure performance killer, and, in any case, wouldn't help on an
SMP system.

David