TASK_SIZE macro is quite deceptive: it looks like a constant but in fact
compiles to 50+ bytes.
Space savings on x86_64 defconfig:
add/remove: 1/0 grow/shrink: 3/24 up/down: 77/-2247 (-2170)
Function old new delta
_task_size - 52 +52
mpol_shared_policy_init 344 363 +19
shmem_get_unmapped_area 92 97 +5
__rseq_handle_notify_resume.cold 34 35 +1
copy_from_user_nmi 123 113 -10
mmap_address_hint_valid 92 56 -36
arch_get_unmapped_area_topdown 471 435 -36
tlb_gather_mmu 164 126 -38
hugetlb_get_unmapped_area 774 736 -38
__create_xol_area 497 458 -39
arch_tlb_gather_mmu 160 120 -40
setup_new_exec 380 336 -44
__x64_sys_mlockall 378 333 -45
__ia32_sys_mlockall 378 333 -45
tlb_flush_mmu 235 189 -46
unmap_page_range 2098 2048 -50
copy_mount_options 518 465 -53
__get_user_pages 1737 1675 -62
get_unmapped_area 270 204 -66
perf_prepare_sample 1176 1098 -78
perf_callchain_user 549 469 -80
mremap_to.isra 545 457 -88
arch_tlb_finish_mmu 394 305 -89
__do_munmap 1039 927 -112
elf_map 527 409 -118
prctl_set_mm 1509 1335 -174
__rseq_handle_notify_resume 1116 906 -210
load_elf_binary 11761 11111 -650
Total: Before=14121337, After=14119167, chg -0.02%
Signed-off-by: Alexey Dobriyan <[email protected]>
---
arch/x86/include/asm/processor.h | 4 ++--
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/task_size_64.c | 9 +++++++++
3 files changed, 12 insertions(+), 2 deletions(-)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -887,8 +887,8 @@ static inline void spin_lock_prefetch(const void *x)
#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \
IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
-#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
- IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+unsigned long _task_size(void);
+#define TASK_SIZE _task_size()
#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
IA32_PAGE_OFFSET : TASK_SIZE_MAX)
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -46,6 +46,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o
obj-$(CONFIG_COMPAT) += signal_compat.o
+obj-$(CONFIG_X86_64) += task_size_64.o
obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o dumpstack.o nmi.o
obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
new file mode 100644
--- /dev/null
+++ b/arch/x86/kernel/task_size_64.c
@@ -0,0 +1,9 @@
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/thread_info.h>
+
+unsigned long _task_size(void)
+{
+ return test_thread_flag(TIF_ADDR32) ? IA32_PAGE_OFFSET : TASK_SIZE_MAX;
+}
+EXPORT_SYMBOL(_task_size);
* Alexey Dobriyan <[email protected]> wrote:
> TASK_SIZE macro is quite deceptive: it looks like a constant but in fact
> compiles to 50+ bytes.
>
> Space savings on x86_64 defconfig:
>
> add/remove: 1/0 grow/shrink: 3/24 up/down: 77/-2247 (-2170)
> Function old new delta
> _task_size - 52 +52
> mpol_shared_policy_init 344 363 +19
> shmem_get_unmapped_area 92 97 +5
> __rseq_handle_notify_resume.cold 34 35 +1
> copy_from_user_nmi 123 113 -10
> mmap_address_hint_valid 92 56 -36
> arch_get_unmapped_area_topdown 471 435 -36
> tlb_gather_mmu 164 126 -38
> hugetlb_get_unmapped_area 774 736 -38
> __create_xol_area 497 458 -39
> arch_tlb_gather_mmu 160 120 -40
> setup_new_exec 380 336 -44
> __x64_sys_mlockall 378 333 -45
> __ia32_sys_mlockall 378 333 -45
> tlb_flush_mmu 235 189 -46
> unmap_page_range 2098 2048 -50
> copy_mount_options 518 465 -53
> __get_user_pages 1737 1675 -62
> get_unmapped_area 270 204 -66
> perf_prepare_sample 1176 1098 -78
> perf_callchain_user 549 469 -80
> mremap_to.isra 545 457 -88
> arch_tlb_finish_mmu 394 305 -89
> __do_munmap 1039 927 -112
> elf_map 527 409 -118
> prctl_set_mm 1509 1335 -174
> __rseq_handle_notify_resume 1116 906 -210
> load_elf_binary 11761 11111 -650
> Total: Before=14121337, After=14119167, chg -0.02%
>
> Signed-off-by: Alexey Dobriyan <[email protected]>
> ---
>
> arch/x86/include/asm/processor.h | 4 ++--
> arch/x86/kernel/Makefile | 1 +
> arch/x86/kernel/task_size_64.c | 9 +++++++++
> 3 files changed, 12 insertions(+), 2 deletions(-)
>
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -887,8 +887,8 @@ static inline void spin_lock_prefetch(const void *x)
>
> #define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \
> IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
> -#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
> - IA32_PAGE_OFFSET : TASK_SIZE_MAX)
> +unsigned long _task_size(void);
> +#define TASK_SIZE _task_size()
> #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
> IA32_PAGE_OFFSET : TASK_SIZE_MAX)
>
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -46,6 +46,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace
>
> obj-y := process_$(BITS).o signal.o
> obj-$(CONFIG_COMPAT) += signal_compat.o
> +obj-$(CONFIG_X86_64) += task_size_64.o
> obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
> obj-y += time.o ioport.o dumpstack.o nmi.o
> obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
> new file mode 100644
> --- /dev/null
> +++ b/arch/x86/kernel/task_size_64.c
> @@ -0,0 +1,9 @@
> +#include <linux/export.h>
> +#include <linux/sched.h>
> +#include <linux/thread_info.h>
> +
> +unsigned long _task_size(void)
> +{
> + return test_thread_flag(TIF_ADDR32) ? IA32_PAGE_OFFSET : TASK_SIZE_MAX;
> +}
> +EXPORT_SYMBOL(_task_size);
Good idea - but instead of adding yet another compilation unit, why not
stick _task_size() into arch/x86/kernel/process_64.c, which is the
canonical place for process management related arch functions?
Thanks,
Ingo
On April 21, 2019 11:28:42 AM PDT, Ingo Molnar <[email protected]> wrote:
>
>* Alexey Dobriyan <[email protected]> wrote:
>
>> TASK_SIZE macro is quite deceptive: it looks like a constant but in
>fact
>> compiles to 50+ bytes.
>>
>> Space savings on x86_64 defconfig:
>>
>> add/remove: 1/0 grow/shrink: 3/24 up/down: 77/-2247 (-2170)
>> Function old new delta
>> _task_size - 52 +52
>> mpol_shared_policy_init 344 363 +19
>> shmem_get_unmapped_area 92 97 +5
>> __rseq_handle_notify_resume.cold 34 35 +1
>> copy_from_user_nmi 123 113 -10
>> mmap_address_hint_valid 92 56 -36
>> arch_get_unmapped_area_topdown 471 435 -36
>> tlb_gather_mmu 164 126 -38
>> hugetlb_get_unmapped_area 774 736 -38
>> __create_xol_area 497 458 -39
>> arch_tlb_gather_mmu 160 120 -40
>> setup_new_exec 380 336 -44
>> __x64_sys_mlockall 378 333 -45
>> __ia32_sys_mlockall 378 333 -45
>> tlb_flush_mmu 235 189 -46
>> unmap_page_range 2098 2048 -50
>> copy_mount_options 518 465 -53
>> __get_user_pages 1737 1675 -62
>> get_unmapped_area 270 204 -66
>> perf_prepare_sample 1176 1098 -78
>> perf_callchain_user 549 469 -80
>> mremap_to.isra 545 457 -88
>> arch_tlb_finish_mmu 394 305 -89
>> __do_munmap 1039 927 -112
>> elf_map 527 409 -118
>> prctl_set_mm 1509 1335 -174
>> __rseq_handle_notify_resume 1116 906 -210
>> load_elf_binary 11761 11111 -650
>> Total: Before=14121337, After=14119167, chg -0.02%
>>
>> Signed-off-by: Alexey Dobriyan <[email protected]>
>> ---
>>
>> arch/x86/include/asm/processor.h | 4 ++--
>> arch/x86/kernel/Makefile | 1 +
>> arch/x86/kernel/task_size_64.c | 9 +++++++++
>> 3 files changed, 12 insertions(+), 2 deletions(-)
>>
>> --- a/arch/x86/include/asm/processor.h
>> +++ b/arch/x86/include/asm/processor.h
>> @@ -887,8 +887,8 @@ static inline void spin_lock_prefetch(const void
>*x)
>>
>> #define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \
>> IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
>> -#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
>> - IA32_PAGE_OFFSET : TASK_SIZE_MAX)
>> +unsigned long _task_size(void);
>> +#define TASK_SIZE _task_size()
>> #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child,
>TIF_ADDR32)) ? \
>> IA32_PAGE_OFFSET : TASK_SIZE_MAX)
>>
>> --- a/arch/x86/kernel/Makefile
>> +++ b/arch/x86/kernel/Makefile
>> @@ -46,6 +46,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace
>>
>> obj-y := process_$(BITS).o signal.o
>> obj-$(CONFIG_COMPAT) += signal_compat.o
>> +obj-$(CONFIG_X86_64) += task_size_64.o
>> obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
>> obj-y += time.o ioport.o dumpstack.o nmi.o
>> obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
>> new file mode 100644
>> --- /dev/null
>> +++ b/arch/x86/kernel/task_size_64.c
>> @@ -0,0 +1,9 @@
>> +#include <linux/export.h>
>> +#include <linux/sched.h>
>> +#include <linux/thread_info.h>
>> +
>> +unsigned long _task_size(void)
>> +{
>> + return test_thread_flag(TIF_ADDR32) ? IA32_PAGE_OFFSET :
>TASK_SIZE_MAX;
>> +}
>> +EXPORT_SYMBOL(_task_size);
>
>Good idea - but instead of adding yet another compilation unit, why not
>
>stick _task_size() into arch/x86/kernel/process_64.c, which is the
>canonical place for process management related arch functions?
>
>Thanks,
>
> Ingo
Better yet... since TIF_ADDR32 isn't something that changes randomly, perhaps this should be a separate variable?
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
On Sun, Apr 21, 2019 at 01:07:08PM -0700, [email protected] wrote:
> On April 21, 2019 11:28:42 AM PDT, Ingo Molnar <[email protected]> wrote:
> >
> >* Alexey Dobriyan <[email protected]> wrote:
> >
> >> TASK_SIZE macro is quite deceptive: it looks like a constant but in
> >fact
> >> compiles to 50+ bytes.
> >>
> >> Space savings on x86_64 defconfig:
> >>
> >> add/remove: 1/0 grow/shrink: 3/24 up/down: 77/-2247 (-2170)
> >> Function old new delta
> >> _task_size - 52 +52
> >> mpol_shared_policy_init 344 363 +19
> >> shmem_get_unmapped_area 92 97 +5
> >> __rseq_handle_notify_resume.cold 34 35 +1
> >> copy_from_user_nmi 123 113 -10
> >> mmap_address_hint_valid 92 56 -36
> >> arch_get_unmapped_area_topdown 471 435 -36
> >> tlb_gather_mmu 164 126 -38
> >> hugetlb_get_unmapped_area 774 736 -38
> >> __create_xol_area 497 458 -39
> >> arch_tlb_gather_mmu 160 120 -40
> >> setup_new_exec 380 336 -44
> >> __x64_sys_mlockall 378 333 -45
> >> __ia32_sys_mlockall 378 333 -45
> >> tlb_flush_mmu 235 189 -46
> >> unmap_page_range 2098 2048 -50
> >> copy_mount_options 518 465 -53
> >> __get_user_pages 1737 1675 -62
> >> get_unmapped_area 270 204 -66
> >> perf_prepare_sample 1176 1098 -78
> >> perf_callchain_user 549 469 -80
> >> mremap_to.isra 545 457 -88
> >> arch_tlb_finish_mmu 394 305 -89
> >> __do_munmap 1039 927 -112
> >> elf_map 527 409 -118
> >> prctl_set_mm 1509 1335 -174
> >> __rseq_handle_notify_resume 1116 906 -210
> >> load_elf_binary 11761 11111 -650
> >> Total: Before=14121337, After=14119167, chg -0.02%
> >>
> >> Signed-off-by: Alexey Dobriyan <[email protected]>
> >> ---
> >>
> >> arch/x86/include/asm/processor.h | 4 ++--
> >> arch/x86/kernel/Makefile | 1 +
> >> arch/x86/kernel/task_size_64.c | 9 +++++++++
> >> 3 files changed, 12 insertions(+), 2 deletions(-)
> >>
> >> --- a/arch/x86/include/asm/processor.h
> >> +++ b/arch/x86/include/asm/processor.h
> >> @@ -887,8 +887,8 @@ static inline void spin_lock_prefetch(const void
> >*x)
> >>
> >> #define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \
> >> IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
> >> -#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
> >> - IA32_PAGE_OFFSET : TASK_SIZE_MAX)
> >> +unsigned long _task_size(void);
> >> +#define TASK_SIZE _task_size()
> >> #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child,
> >TIF_ADDR32)) ? \
> >> IA32_PAGE_OFFSET : TASK_SIZE_MAX)
> >>
> >> --- a/arch/x86/kernel/Makefile
> >> +++ b/arch/x86/kernel/Makefile
> >> @@ -46,6 +46,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace
> >>
> >> obj-y := process_$(BITS).o signal.o
> >> obj-$(CONFIG_COMPAT) += signal_compat.o
> >> +obj-$(CONFIG_X86_64) += task_size_64.o
> >> obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
> >> obj-y += time.o ioport.o dumpstack.o nmi.o
> >> obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
> >> new file mode 100644
> >> --- /dev/null
> >> +++ b/arch/x86/kernel/task_size_64.c
> >> @@ -0,0 +1,9 @@
> >> +#include <linux/export.h>
> >> +#include <linux/sched.h>
> >> +#include <linux/thread_info.h>
> >> +
> >> +unsigned long _task_size(void)
> >> +{
> >> + return test_thread_flag(TIF_ADDR32) ? IA32_PAGE_OFFSET :
> >TASK_SIZE_MAX;
> >> +}
> >> +EXPORT_SYMBOL(_task_size);
> >
> >Good idea - but instead of adding yet another compilation unit, why not
> >
> >stick _task_size() into arch/x86/kernel/process_64.c, which is the
> >canonical place for process management related arch functions?
> >
> >Thanks,
> >
> > Ingo
>
> Better yet... since TIF_ADDR32 isn't something that changes randomly, perhaps this should be a separate variable?
Maybe. I only thought about putting every 32-bit related flag under
CONFIG_COMPAT to further eradicate bloat (and force everyone else
to keep an eye on it, ha-ha).
TASK_SIZE macro is quite deceptive: it looks like a constant but in fact
compiles to 50+ bytes.
Space savings on x86_64 defconfig:
add/remove: 1/0 grow/shrink: 3/24 up/down: 77/-2247 (-2170)
Function old new delta
_task_size - 52 +52
mpol_shared_policy_init 344 363 +19
shmem_get_unmapped_area 92 97 +5
__rseq_handle_notify_resume.cold 34 35 +1
copy_from_user_nmi 123 113 -10
mmap_address_hint_valid 92 56 -36
arch_get_unmapped_area_topdown 471 435 -36
tlb_gather_mmu 164 126 -38
hugetlb_get_unmapped_area 774 736 -38
__create_xol_area 497 458 -39
arch_tlb_gather_mmu 160 120 -40
setup_new_exec 380 336 -44
__x64_sys_mlockall 378 333 -45
__ia32_sys_mlockall 378 333 -45
tlb_flush_mmu 235 189 -46
unmap_page_range 2098 2048 -50
copy_mount_options 518 465 -53
__get_user_pages 1737 1675 -62
get_unmapped_area 270 204 -66
perf_prepare_sample 1176 1098 -78
perf_callchain_user 549 469 -80
mremap_to.isra 545 457 -88
arch_tlb_finish_mmu 394 305 -89
__do_munmap 1039 927 -112
elf_map 527 409 -118
prctl_set_mm 1509 1335 -174
__rseq_handle_notify_resume 1116 906 -210
load_elf_binary 11761 11111 -650 <===
Signed-off-by: Alexey Dobriyan <[email protected]>
---
arch/x86/include/asm/processor.h | 4 ++--
arch/x86/kernel/process_64.c | 6 ++++++
2 files changed, 8 insertions(+), 2 deletions(-)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -887,8 +887,8 @@ static inline void spin_lock_prefetch(const void *x)
#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \
IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
-#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
- IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+unsigned long _task_size(void);
+#define TASK_SIZE _task_size()
#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
IA32_PAGE_OFFSET : TASK_SIZE_MAX)
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -827,3 +827,9 @@ unsigned long KSTK_ESP(struct task_struct *task)
{
return task_pt_regs(task)->sp;
}
+
+unsigned long _task_size(void)
+{
+ return test_thread_flag(TIF_ADDR32) ? IA32_PAGE_OFFSET : TASK_SIZE_MAX;
+}
+EXPORT_SYMBOL(_task_size);
* Alexey Dobriyan <[email protected]> wrote:
> > >> +++ b/arch/x86/kernel/task_size_64.c
> > >> @@ -0,0 +1,9 @@
> > >> +#include <linux/export.h>
> > >> +#include <linux/sched.h>
> > >> +#include <linux/thread_info.h>
> > >> +
> > >> +unsigned long _task_size(void)
> > >> +{
> > >> + return test_thread_flag(TIF_ADDR32) ? IA32_PAGE_OFFSET :
> > >TASK_SIZE_MAX;
> > >> +}
> > >> +EXPORT_SYMBOL(_task_size);
> > >
> > >Good idea - but instead of adding yet another compilation unit, why not
> > >
> > >stick _task_size() into arch/x86/kernel/process_64.c, which is the
> > >canonical place for process management related arch functions?
> > >
> > >Thanks,
> > >
> > > Ingo
> >
> > Better yet... since TIF_ADDR32 isn't something that changes randomly,
> > perhaps this should be a separate variable?
>
> Maybe. I only thought about putting every 32-bit related flag under
> CONFIG_COMPAT to further eradicate bloat (and force everyone else to
> keep an eye on it, ha-ha).
Basically TIF_ADDR32 is only set for a task if set_personality_ia32() is
called, which function is called in the following circumstances:
- arch/x86/ia32/ia32_aout.c:load_aout_binary()
This is in exec(), when a new binary is loaded for the current task,
via search_binary_handler() and exec_binprm(). Ordering is
synchronous, AFAICS there can be no race between TASK_SIZE users and
the set_personality_ia32() call which is always for the current task.
- in COMPAT_SET_PERSONALITY(), which through macro detours ends up being
in SET_PERSONALITY2(), which is used in fs/compat_binfmt_elf.c's
load_elf_binary(), used in a similar fashion in exec() as the AOUT
case above. One particular macro detour of note is that
fs/compat_binfmt_elf.c #includes fs/binfmt_elf.c and re-defines the
personality setting method to map to set_personality_ia32().
When set_personality_ia32() is called then TIF_ADDR32 is set
unconditionally, without any Kconfig variations.
TIF_ADDR32 is cleared:
- In set_personality_64bit(), when a 64-bit binary is loaded via
fs/binfmt_elf.c.
- It also defaults to clear in the init task, which is inherited by the
initial kernel threads and any user-space task they might end up
executing.
So the conclusion is that IMO we can safely put TASK_SIZE into a new
thread_info()->task_size field, and:
- change ->task_size to the 32-bit address space in
set_personality_ia32()
- change ->task_size to teh 64-bit address space in the init task and in
set_personality_64bit().
This should cover it I think, unless I missed something.
Thanks,
Ingo
> On Apr 22, 2019, at 3:34 AM, Ingo Molnar <[email protected]> wrote:
>
>
> * Alexey Dobriyan <[email protected]> wrote:
>
>>>>> +++ b/arch/x86/kernel/task_size_64.c
>>>>> @@ -0,0 +1,9 @@
>>>>> +#include <linux/export.h>
>>>>> +#include <linux/sched.h>
>>>>> +#include <linux/thread_info.h>
>>>>> +
>>>>> +unsigned long _task_size(void)
>>>>> +{
>>>>> + return test_thread_flag(TIF_ADDR32) ? IA32_PAGE_OFFSET :
>>>> TASK_SIZE_MAX;
>>>>> +}
>>>>> +EXPORT_SYMBOL(_task_size);
>>>>
>>>> Good idea - but instead of adding yet another compilation unit, why not
>>>>
>>>> stick _task_size() into arch/x86/kernel/process_64.c, which is the
>>>> canonical place for process management related arch functions?
>>>>
>>>> Thanks,
>>>>
>>>> Ingo
>>>
>>> Better yet... since TIF_ADDR32 isn't something that changes randomly,
>>> perhaps this should be a separate variable?
>>
>> Maybe. I only thought about putting every 32-bit related flag under
>> CONFIG_COMPAT to further eradicate bloat (and force everyone else to
>> keep an eye on it, ha-ha).
>
> Basically TIF_ADDR32 is only set for a task if set_personality_ia32() is
> called, which function is called in the following circumstances:
>
> - arch/x86/ia32/ia32_aout.c:load_aout_binary()
>
> This is in exec(), when a new binary is loaded for the current task,
> via search_binary_handler() and exec_binprm(). Ordering is
> synchronous, AFAICS there can be no race between TASK_SIZE users and
> the set_personality_ia32() call which is always for the current task.
>
> - in COMPAT_SET_PERSONALITY(), which through macro detours ends up being
> in SET_PERSONALITY2(), which is used in fs/compat_binfmt_elf.c's
> load_elf_binary(), used in a similar fashion in exec() as the AOUT
> case above. One particular macro detour of note is that
> fs/compat_binfmt_elf.c #includes fs/binfmt_elf.c and re-defines the
> personality setting method to map to set_personality_ia32().
>
> When set_personality_ia32() is called then TIF_ADDR32 is set
> unconditionally, without any Kconfig variations.
>
> TIF_ADDR32 is cleared:
>
> - In set_personality_64bit(), when a 64-bit binary is loaded via
> fs/binfmt_elf.c.
>
> - It also defaults to clear in the init task, which is inherited by the
> initial kernel threads and any user-space task they might end up
> executing.
>
> So the conclusion is that IMO we can safely put TASK_SIZE into a new
> thread_info()->task_size field, and:
>
> - change ->task_size to the 32-bit address space in
> set_personality_ia32()
>
> - change ->task_size to teh 64-bit address space in the init task and in
> set_personality_64bit().
>
> This should cover it I think, unless I missed something.
>
Are there really enough TASK_SIZE users to justify any of this?
On Mon, Apr 22, 2019 at 12:34:49PM +0200, Ingo Molnar wrote:
> When set_personality_ia32() is called then TIF_ADDR32 is set
> unconditionally, without any Kconfig variations.
Indeed.
personality(PER_LINUX32) = 0 (PER_LINUX)
I only wasted about half an evening ifdefing TIF_ flags.
Thanks for saving a lot of time!
On Mon, Apr 22, 2019 at 07:30:40AM -0700, Andy Lutomirski wrote:
>
>
> > On Apr 22, 2019, at 3:34 AM, Ingo Molnar <[email protected]> wrote:
> >
> >
> > * Alexey Dobriyan <[email protected]> wrote:
> >
> >>>>> +++ b/arch/x86/kernel/task_size_64.c
> >>>>> @@ -0,0 +1,9 @@
> >>>>> +#include <linux/export.h>
> >>>>> +#include <linux/sched.h>
> >>>>> +#include <linux/thread_info.h>
> >>>>> +
> >>>>> +unsigned long _task_size(void)
> >>>>> +{
> >>>>> + return test_thread_flag(TIF_ADDR32) ? IA32_PAGE_OFFSET :
> >>>> TASK_SIZE_MAX;
> >>>>> +}
> >>>>> +EXPORT_SYMBOL(_task_size);
> >>>>
> >>>> Good idea - but instead of adding yet another compilation unit, why not
> >>>>
> >>>> stick _task_size() into arch/x86/kernel/process_64.c, which is the
> >>>> canonical place for process management related arch functions?
> >>>>
> >>>> Thanks,
> >>>>
> >>>> Ingo
> >>>
> >>> Better yet... since TIF_ADDR32 isn't something that changes randomly,
> >>> perhaps this should be a separate variable?
> >>
> >> Maybe. I only thought about putting every 32-bit related flag under
> >> CONFIG_COMPAT to further eradicate bloat (and force everyone else to
> >> keep an eye on it, ha-ha).
> >
> > Basically TIF_ADDR32 is only set for a task if set_personality_ia32() is
> > called, which function is called in the following circumstances:
> >
> > - arch/x86/ia32/ia32_aout.c:load_aout_binary()
> >
> > This is in exec(), when a new binary is loaded for the current task,
> > via search_binary_handler() and exec_binprm(). Ordering is
> > synchronous, AFAICS there can be no race between TASK_SIZE users and
> > the set_personality_ia32() call which is always for the current task.
> >
> > - in COMPAT_SET_PERSONALITY(), which through macro detours ends up being
> > in SET_PERSONALITY2(), which is used in fs/compat_binfmt_elf.c's
> > load_elf_binary(), used in a similar fashion in exec() as the AOUT
> > case above. One particular macro detour of note is that
> > fs/compat_binfmt_elf.c #includes fs/binfmt_elf.c and re-defines the
> > personality setting method to map to set_personality_ia32().
> >
> > When set_personality_ia32() is called then TIF_ADDR32 is set
> > unconditionally, without any Kconfig variations.
> >
> > TIF_ADDR32 is cleared:
> >
> > - In set_personality_64bit(), when a 64-bit binary is loaded via
> > fs/binfmt_elf.c.
> >
> > - It also defaults to clear in the init task, which is inherited by the
> > initial kernel threads and any user-space task they might end up
> > executing.
> >
> > So the conclusion is that IMO we can safely put TASK_SIZE into a new
> > thread_info()->task_size field, and:
> >
> > - change ->task_size to the 32-bit address space in
> > set_personality_ia32()
> >
> > - change ->task_size to teh 64-bit address space in the init task and in
> > set_personality_64bit().
> >
> > This should cover it I think, unless I missed something.
> >
>
> Are there really enough TASK_SIZE users to justify any of this?
Saving 2KB on a defconfig is quite a lot.
If put into thread_info, ->task_size can be pulled using just RAX which
in turn allows to do
asm volatile "call %P" ... "=a" (...)
saving even more space.
But it is late here so don't quote me.
On Sun, Apr 21, 2019 at 9:06 AM Alexey Dobriyan <[email protected]> wrote:
>
> TASK_SIZE macro is quite deceptive: it looks like a constant but in fact
> compiles to 50+ bytes.
Honestly, if you are interested in improving TASK_SIZE, I'd really
like to see you try to go even further than this.
TASK_SIZE _used_ to just be a fixed constant, which is why it has that
name and why the usage patterns are what they are.
But since that isn't true any more, I'd much rather fix the _name_,
and I'd much rather fix the nasty complex hidden behavior, rather than
just keep the name and keep the behavior, but turning it from an
inline macro to a function call.
And as Ingo points out, we should be able to just make it a field of
its own, instead of that complex dance of TIF_ADDR32 etc.
However, I think it would be better if that field would be in "struct
mm_struct" instead of Ingo's suggestion of the thread. Because while
it's currently a per-thread flag, I think it is only set by execve(),
so it always ends up being the same per-mm. No?
Also, we could/should just make the existing *users* of TASK_SIZE know
that it's no longer a simple constant, so all those functions that use
it many times could just do
unsigned long task_size = TASK_SIZE;
rather than re-compute it multiple times like they do now.
In fact, making it a function call in many ways makes things *worse*,
although maybe we could at least mark the function "pure" so that gcc
would be able to cache the end result. But that would actually be
wrong for the sequences that maybe do change the thread flags, so I
hate that idea too.
Much better to just cache it explicitly in the cases where we see that
it's currently generating bad code.
Linus
On Mon, Apr 22, 2019 at 3:09 PM Alexey Dobriyan <[email protected]> wrote:
>
> On Mon, Apr 22, 2019 at 07:30:40AM -0700, Andy Lutomirski wrote:
> >
> >
> > > On Apr 22, 2019, at 3:34 AM, Ingo Molnar <[email protected]> wrote:
> > >
> > >
> > > * Alexey Dobriyan <[email protected]> wrote:
> > >
> > >>>>> +++ b/arch/x86/kernel/task_size_64.c
> > >>>>> @@ -0,0 +1,9 @@
> > >>>>> +#include <linux/export.h>
> > >>>>> +#include <linux/sched.h>
> > >>>>> +#include <linux/thread_info.h>
> > >>>>> +
> > >>>>> +unsigned long _task_size(void)
> > >>>>> +{
> > >>>>> + return test_thread_flag(TIF_ADDR32) ? IA32_PAGE_OFFSET :
> > >>>> TASK_SIZE_MAX;
> > >>>>> +}
> > >>>>> +EXPORT_SYMBOL(_task_size);
> > >>>>
> > >>>> Good idea - but instead of adding yet another compilation unit, why not
> > >>>>
> > >>>> stick _task_size() into arch/x86/kernel/process_64.c, which is the
> > >>>> canonical place for process management related arch functions?
> > >>>>
> > >>>> Thanks,
> > >>>>
> > >>>> Ingo
> > >>>
> > >>> Better yet... since TIF_ADDR32 isn't something that changes randomly,
> > >>> perhaps this should be a separate variable?
> > >>
> > >> Maybe. I only thought about putting every 32-bit related flag under
> > >> CONFIG_COMPAT to further eradicate bloat (and force everyone else to
> > >> keep an eye on it, ha-ha).
> > >
> > > Basically TIF_ADDR32 is only set for a task if set_personality_ia32() is
> > > called, which function is called in the following circumstances:
> > >
> > > - arch/x86/ia32/ia32_aout.c:load_aout_binary()
> > >
> > > This is in exec(), when a new binary is loaded for the current task,
> > > via search_binary_handler() and exec_binprm(). Ordering is
> > > synchronous, AFAICS there can be no race between TASK_SIZE users and
> > > the set_personality_ia32() call which is always for the current task.
> > >
> > > - in COMPAT_SET_PERSONALITY(), which through macro detours ends up being
> > > in SET_PERSONALITY2(), which is used in fs/compat_binfmt_elf.c's
> > > load_elf_binary(), used in a similar fashion in exec() as the AOUT
> > > case above. One particular macro detour of note is that
> > > fs/compat_binfmt_elf.c #includes fs/binfmt_elf.c and re-defines the
> > > personality setting method to map to set_personality_ia32().
> > >
> > > When set_personality_ia32() is called then TIF_ADDR32 is set
> > > unconditionally, without any Kconfig variations.
> > >
> > > TIF_ADDR32 is cleared:
> > >
> > > - In set_personality_64bit(), when a 64-bit binary is loaded via
> > > fs/binfmt_elf.c.
> > >
> > > - It also defaults to clear in the init task, which is inherited by the
> > > initial kernel threads and any user-space task they might end up
> > > executing.
> > >
> > > So the conclusion is that IMO we can safely put TASK_SIZE into a new
> > > thread_info()->task_size field, and:
> > >
> > > - change ->task_size to the 32-bit address space in
> > > set_personality_ia32()
> > >
> > > - change ->task_size to teh 64-bit address space in the init task and in
> > > set_personality_64bit().
> > >
> > > This should cover it I think, unless I missed something.
> > >
> >
> > Are there really enough TASK_SIZE users to justify any of this?
>
> Saving 2KB on a defconfig is quite a lot.
Saving 2kB of text by adding 8 bytes to thread_info seems rather
dubious to me. You only need 256 tasks before you lose. My
not-particularly-loaded laptop has 865 tasks right now.
As a general principle, the mere existence of TIF_ADDR32 is a bug.
The value of that flag is *wrong* under the 32-bit variant of CRIU.
How about instead making some more progress toward getting rid of
dubious TASK_SIZE users? I'm working on a little series to get rid of
most of them. Meanwhile: it sure looks like a large fraction of the
users are confused as to whether TASK_SIZE is the highest user address
or the lowest non-user address.
* Andy Lutomirski <[email protected]> wrote:
> > Saving 2KB on a defconfig is quite a lot.
>
> Saving 2kB of text by adding 8 bytes to thread_info seems rather
> dubious to me. You only need 256 tasks before you lose. My
> not-particularly-loaded laptop has 865 tasks right now.
I was suggesting current->task_size or thread_info->task_size as a way to
100% avoid the function call overhead. Worth a tiny amount of RAM - even
with 1 million tasks it's only 4MB of RAM. ;-)
Some TASK_SIZE users are prominent syscalls: mmap(),
> As a general principle, the mere existence of TIF_ADDR32 is a bug. The
> value of that flag is *wrong* under the 32-bit variant of CRIU. How
> about instead making some more progress toward getting rid of dubious
> TASK_SIZE users? I'm working on a little series to get rid of most of
> them. Meanwhile: it sure looks like a large fraction of the users are
> confused as to whether TASK_SIZE is the highest user address or the
> lowest non-user address.
I really like that, replacing TASK_SIZE with *nothing* would be even
faster.
In fact instead of just reducing its usage I'd suggest removing TASK_SIZE
altogether and renaming TASK_SIZE_MAX back to TASK_SIZE, or something
like that - the confusion from the deceptively macro-ish naming of
TASK_SIZE is real.
The original commit of making TASK_SIZE dynamic on the task's compat flag
was done in:
84929801e14d: [PATCH] x86_64: TASK_SIZE fixes for compatibility mode processes
Here's the justification given:
Appended patch will setup compatibility mode TASK_SIZE properly. This will
fix atleast three known bugs that can be encountered while running
compatibility mode apps.
a) A malicious 32bit app can have an elf section at 0xffffe000. During
exec of this app, we will have a memory leak as insert_vm_struct() is
not checking for return value in syscall32_setup_pages() and thus not
freeing the vma allocated for the vsyscall page. And instead of exec
failing (as it has addresses > TASK_SIZE), we were allowing it to
succeed previously.
b) With a 32bit app, hugetlb_get_unmapped_area/arch_get_unmapped_area
may return addresses beyond 32bits, ultimately causing corruption
because of wrap-around and resulting in SEGFAULT, instead of returning
ENOMEM.
c) 32bit app doing this below mmap will now fail.
mmap((void *)(0xFFFFE000UL), 0x10000UL, PROT_READ|PROT_WRITE,
MAP_FIXED|MAP_PRIVATE|MAP_ANON, 0, 0);
I believe a) is addressed by getting rid of the vsyscall page - but it
might also not be a current problem because the vsyscall page has its own
gate-vma now.
b) shouldn't be an issue if the mmap allocator correctly treats the
compat bit - this doesn't require generic TASK_SIZE variations though, as
the mmap allocator is already specific to arch/x86/.
c) is a variant of a) I believe, which should be fixed by now.
I just looked through some of the current TASK_SIZE users, and *ALL* of
them seem dubious to me, with the exception of the mmap allocators. In
fact some of them seem to be active bugs:
kernel/:
- PR_SET_MM_MAP, PR_SET_MM_MAP_SIZE, prctl_set_mm() et al. Ugh, what a
nasty prctl()! But besides that, the TASK_SIZE restriction to the ABI
looks questionable: if we offer this CRIU functionality then why
should it be impossible for a 32-bit CRIU task to switch to 64-bit?
- kernel/events/core.c: TASK_SIZE_MAX should be a fine filter here, in
fact it's probably *wrong* to restrict the profiling data here just
because the task happens to be in 32-bit compat mode.
- kernel/rseq.c: is this TASK_SIZE restriction even required, wouldn't
TASK_SIZE_MAX be sufficient?
mm/:
- GUP's get_get_area() et al looks really weird - why do we special-case
vsyscalls:
- Can we get rid of the vsyscall page in modern kernels?
- I don't think anyone runs those ancient glibc versions with a
fresh kernel anymore - can we start generating a WARN()ing
perhaps to see whether there's any complaints?
- Or at least pretend it doesn't exist in terms of a GUP target page?
- mm/kasan/generic_report.c:get_wild_bug_type() - this can use
TASK_SIZE_MAX just fine IMHO.
- mm/mempolicy.c:mpol_shared_policy_init() - unsure, but I suspect we
can just create the pseudo-vma with a TASK_SIZE_MAX vm_end just fine.
- mm/mlock.c:mlockall() - I believe it could be considered an outright
*bug* if there any pages outside the 32-bit area and don't get mlocked
by mlockall, just because this is a compat task. Especially with the
CRIU prctl() having 64-bit vmas outside the 32-bit mappings is a real
possibility, right? I.e. TASK_SIZE_MAX would be the right solution
here.
To turn the argument around: beyond the memory allocators, which includes
the mmap and huge-mmap variants plus the SysV shmem allocator, can we
list all the places that absolutely *rely* on TASK_SIZE being TIF_ADDR32
restricted on compat tasks? I couldn't find any.
So I concur 100% that most TASK_SIZE uses are questionable. In fact think
84929801e14d was a mistake, and we should effectively revert it
carefully, by:
- First by moving almost all TASK_SIZE users over to TASK_SIZE_MAX,
analyzing and justifying the impact case by case.
- Then making the mmap allocators compat compatible (ha) without relying
on TASK_SIZE.
- Renaming TASK_SIZE back to TASK_SIZE_MAX and getting rid of the
TASK_SIZE and TASK_SIZE_MAX differentiation.
Or am I missing some complication?
Thanks,
Ingo
On Tue, Apr 23, 2019 at 4:13 AM Ingo Molnar <[email protected]> wrote:
>
>
> * Andy Lutomirski <[email protected]> wrote:
>
> > > Saving 2KB on a defconfig is quite a lot.
> >
> > Saving 2kB of text by adding 8 bytes to thread_info seems rather
> > dubious to me. You only need 256 tasks before you lose. My
> > not-particularly-loaded laptop has 865 tasks right now.
>
> I was suggesting current->task_size or thread_info->task_size as a way to
> 100% avoid the function call overhead. Worth a tiny amount of RAM - even
> with 1 million tasks it's only 4MB of RAM. ;-)
>
> Some TASK_SIZE users are prominent syscalls: mmap(),
>
> > As a general principle, the mere existence of TIF_ADDR32 is a bug. The
> > value of that flag is *wrong* under the 32-bit variant of CRIU. How
> > about instead making some more progress toward getting rid of dubious
> > TASK_SIZE users? I'm working on a little series to get rid of most of
> > them. Meanwhile: it sure looks like a large fraction of the users are
> > confused as to whether TASK_SIZE is the highest user address or the
> > lowest non-user address.
>
> I really like that, replacing TASK_SIZE with *nothing* would be even
> faster.
>
> In fact instead of just reducing its usage I'd suggest removing TASK_SIZE
> altogether and renaming TASK_SIZE_MAX back to TASK_SIZE, or something
> like that - the confusion from the deceptively macro-ish naming of
> TASK_SIZE is real.
>
> The original commit of making TASK_SIZE dynamic on the task's compat flag
> was done in:
>
> 84929801e14d: [PATCH] x86_64: TASK_SIZE fixes for compatibility mode processes
>
> Here's the justification given:
>
> Appended patch will setup compatibility mode TASK_SIZE properly. This will
> fix atleast three known bugs that can be encountered while running
> compatibility mode apps.
>
> a) A malicious 32bit app can have an elf section at 0xffffe000. During
> exec of this app, we will have a memory leak as insert_vm_struct() is
> not checking for return value in syscall32_setup_pages() and thus not
> freeing the vma allocated for the vsyscall page. And instead of exec
> failing (as it has addresses > TASK_SIZE), we were allowing it to
> succeed previously.
>
> b) With a 32bit app, hugetlb_get_unmapped_area/arch_get_unmapped_area
> may return addresses beyond 32bits, ultimately causing corruption
> because of wrap-around and resulting in SEGFAULT, instead of returning
> ENOMEM.
>
> c) 32bit app doing this below mmap will now fail.
>
> mmap((void *)(0xFFFFE000UL), 0x10000UL, PROT_READ|PROT_WRITE,
> MAP_FIXED|MAP_PRIVATE|MAP_ANON, 0, 0);
>
> I believe a) is addressed by getting rid of the vsyscall page - but it
> might also not be a current problem because the vsyscall page has its own
> gate-vma now.
>
I suspect that whatever issue this is predates my involvement in Linux
:) The "gate" VMA, aka vsyscall, is at 0xffffffffff600000, and the
vDSO setup code shouldn't anywhere near that fragile. Also, if this
really a bug, we have it for the 64-bit case, too, and TASK_SIZE isn't
going to help.
> b) shouldn't be an issue if the mmap allocator correctly treats the
> compat bit - this doesn't require generic TASK_SIZE variations though, as
> the mmap allocator is already specific to arch/x86/.
This was mostly fixed by the CRIU folks a while back, I think.
>
> c) is a variant of a) I believe, which should be fixed by now.
>
> I just looked through some of the current TASK_SIZE users, and *ALL* of
> them seem dubious to me, with the exception of the mmap allocators. In
Even the munmap one seems to be a bug.
> fact some of them seem to be active bugs:
>
> kernel/:
>
> - PR_SET_MM_MAP, PR_SET_MM_MAP_SIZE, prctl_set_mm() et al. Ugh, what a
> nasty prctl()! But besides that, the TASK_SIZE restriction to the ABI
> looks questionable: if we offer this CRIU functionality then why
> should it be impossible for a 32-bit CRIU task to switch to 64-bit?
>
> - kernel/events/core.c: TASK_SIZE_MAX should be a fine filter here, in
> fact it's probably *wrong* to restrict the profiling data here just
> because the task happens to be in 32-bit compat mode.
Yep. I have a patch fo rthis.
>
> - kernel/rseq.c: is this TASK_SIZE restriction even required, wouldn't
> TASK_SIZE_MAX be sufficient?
Presumably.
> So I concur 100% that most TASK_SIZE uses are questionable. In fact think
> 84929801e14d was a mistake, and we should effectively revert it
> carefully, by:
>
> - First by moving almost all TASK_SIZE users over to TASK_SIZE_MAX,
> analyzing and justifying the impact case by case.
>
> - Then making the mmap allocators compat compatible (ha) without relying
> on TASK_SIZE.
I have a somewhat hacky patch for this right now.
>
> - Renaming TASK_SIZE back to TASK_SIZE_MAX and getting rid of the
> TASK_SIZE and TASK_SIZE_MAX differentiation.
>
> Or am I missing some complication?
Seems like a great idea to me.
BTW, what the heck is up with get_gate_page()? I'm struggling to
understand what it's even trying to do. If there's an architecture
that allows a user program to mremap() or otherwise position its gate
VMA between TASK_SIZE and TASK_SIZE_MAX, then that code is going to
explode horribly.
A whole bunch of work in this direction is here:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/log/?h=x86/fixes
It's almost entirely untested.
* Andy Lutomirski <[email protected]> wrote:
> > Or am I missing some complication?
>
> Seems like a great idea to me.
>
> BTW, what the heck is up with get_gate_page()? I'm struggling to
> understand what it's even trying to do. If there's an architecture
> that allows a user program to mremap() or otherwise position its gate
> VMA between TASK_SIZE and TASK_SIZE_MAX, then that code is going to
> explode horribly.
I believe it was an old attempt from the times when the vsyscall area
*didn't* have a vma, at all, and only get_gate_page() kept the mmap
allocator from overlapping it with a user vma?
Should IMHO be entirely solved by the vma-ification of all things
vsyscall and vdso, and we can remove this remnant.
> A whole bunch of work in this direction is here:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/log/?h=x86/fixes
>
> It's almost entirely untested.
Please post it as patches once you are somewhat confident in the outcome
and general direction.
Thanks,
Ingo