This defines and exports a platform specific custom vm_get_page_prot() via
subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
macros, which are no longer needed, can be dropped.
Cc: Russell King <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Anshuman Khandual <[email protected]>
---
arch/arm/Kconfig | 1 +
arch/arm/include/asm/pgtable.h | 20 +-------------
arch/arm/lib/uaccess_with_memcpy.c | 2 +-
arch/arm/mm/mmu.c | 44 ++++++++++++++++++++++++++----
4 files changed, 41 insertions(+), 26 deletions(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4c97cb40eebb..87b2e89ef3d6 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -23,6 +23,7 @@ config ARM
select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB || !MMU
select ARCH_HAS_TEARDOWN_DMA_OPS if MMU
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
+ select ARCH_HAS_VM_GET_PAGE_PROT
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAVE_NMI_SAFE_CMPXCHG if CPU_V7 || CPU_V7M || CPU_V6K
select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index cd1f84bb40ae..64711716cd84 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -70,7 +70,7 @@ extern void __pgd_error(const char *file, int line, pgd_t);
#endif
/*
- * The pgprot_* and protection_map entries will be fixed up in runtime
+ * The pgprot_* entries will be fixed up in runtime in vm_get_page_prot()
* to include the cachable and bufferable bits based on memory policy,
* as well as any architecture dependent bits like global/ASID and SMP
* shared mapping bits.
@@ -137,24 +137,6 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
* 2) If we could do execute protection, then read is implied
* 3) write implies read permissions
*/
-#define __P000 __PAGE_NONE
-#define __P001 __PAGE_READONLY
-#define __P010 __PAGE_COPY
-#define __P011 __PAGE_COPY
-#define __P100 __PAGE_READONLY_EXEC
-#define __P101 __PAGE_READONLY_EXEC
-#define __P110 __PAGE_COPY_EXEC
-#define __P111 __PAGE_COPY_EXEC
-
-#define __S000 __PAGE_NONE
-#define __S001 __PAGE_READONLY
-#define __S010 __PAGE_SHARED
-#define __S011 __PAGE_SHARED
-#define __S100 __PAGE_READONLY_EXEC
-#define __S101 __PAGE_READONLY_EXEC
-#define __S110 __PAGE_SHARED_EXEC
-#define __S111 __PAGE_SHARED_EXEC
-
#ifndef __ASSEMBLY__
/*
* ZERO_PAGE is a global shared page that is always zero: used
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 106f83a5ea6d..12d8d9794a28 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -247,7 +247,7 @@ static int __init test_size_treshold(void)
if (!dst_page)
goto no_dst;
kernel_ptr = page_address(src_page);
- user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010));
+ user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__PAGE_COPY));
if (!user_ptr)
goto no_vmap;
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 274e4f73fd33..9cdf45da57de 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -403,6 +403,8 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE);
}
+static pteval_t user_pgprot;
+
/*
* Adjust the PMD section entries according to the CPU in use.
*/
@@ -410,7 +412,7 @@ static void __init build_mem_type_table(void)
{
struct cachepolicy *cp;
unsigned int cr = get_cr();
- pteval_t user_pgprot, kern_pgprot, vecs_pgprot;
+ pteval_t kern_pgprot, vecs_pgprot;
int cpu_arch = cpu_architecture();
int i;
@@ -627,11 +629,6 @@ static void __init build_mem_type_table(void)
user_pgprot |= PTE_EXT_PXN;
#endif
- for (i = 0; i < 16; i++) {
- pteval_t v = pgprot_val(protection_map[i]);
- protection_map[i] = __pgprot(v | user_pgprot);
- }
-
mem_types[MT_LOW_VECTORS].prot_pte |= vecs_pgprot;
mem_types[MT_HIGH_VECTORS].prot_pte |= vecs_pgprot;
@@ -670,6 +667,41 @@ static void __init build_mem_type_table(void)
}
}
+pgprot_t vm_get_page_prot(unsigned long vm_flags)
+{
+ switch (vm_flags & (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)) {
+ case VM_NONE:
+ return __pgprot(pgprot_val(__PAGE_NONE) | user_pgprot);
+ case VM_READ:
+ return __pgprot(pgprot_val(__PAGE_READONLY) | user_pgprot);
+ case VM_WRITE:
+ case VM_WRITE | VM_READ:
+ return __pgprot(pgprot_val(__PAGE_COPY) | user_pgprot);
+ case VM_EXEC:
+ case VM_EXEC | VM_READ:
+ return __pgprot(pgprot_val(__PAGE_READONLY_EXEC) | user_pgprot);
+ case VM_EXEC | VM_WRITE:
+ case VM_EXEC | VM_WRITE | VM_READ:
+ return __pgprot(pgprot_val(__PAGE_COPY_EXEC) | user_pgprot);
+ case VM_SHARED:
+ return __pgprot(pgprot_val(__PAGE_NONE) | user_pgprot);
+ case VM_SHARED | VM_READ:
+ return __pgprot(pgprot_val(__PAGE_READONLY) | user_pgprot);
+ case VM_SHARED | VM_WRITE:
+ case VM_SHARED | VM_WRITE | VM_READ:
+ return __pgprot(pgprot_val(__PAGE_SHARED) | user_pgprot);
+ case VM_SHARED | VM_EXEC:
+ case VM_SHARED | VM_EXEC | VM_READ:
+ return __pgprot(pgprot_val(__PAGE_READONLY_EXEC) | user_pgprot);
+ case VM_SHARED | VM_EXEC | VM_WRITE:
+ case VM_SHARED | VM_EXEC | VM_WRITE | VM_READ:
+ return __pgprot(pgprot_val(__PAGE_SHARED_EXEC) | user_pgprot);
+ default:
+ BUILD_BUG();
+ }
+}
+EXPORT_SYMBOL(vm_get_page_prot);
+
#ifdef CONFIG_ARM_DMA_MEM_BUFFERABLE
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
--
2.25.1
On Tue, Mar 01, 2022 at 05:30:41AM +0530, Anshuman Khandual wrote:
> On 2/28/22 4:27 PM, Russell King (Oracle) wrote:
> > On Mon, Feb 28, 2022 at 04:17:32PM +0530, Anshuman Khandual wrote:
> >> This defines and exports a platform specific custom vm_get_page_prot() via
> >> subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
> >> macros can be dropped which are no longer needed.
> >
> > What I would really like to know is why having to run _code_ to work out
> > what the page protections need to be is better than looking it up in a
> > table.
> >
> > Not only is this more expensive in terms of CPU cycles, it also brings
> > additional code size with it.
> >
> > I'm struggling to see what the benefit is.
>
> Currently vm_get_page_prot() is also being _run_ to fetch required page
> protection values. Although that is being run in the core MM and from a
> platform perspective __SXXX, __PXXX are just being exported for a table.
> Looking it up in a table (and applying more constructs there after) is
> not much different than a clean switch case statement in terms of CPU
> usage. So this is not more expensive in terms of CPU cycles.
I disagree.
However, let's base this disagreement on some evidence. Here is the
present 32-bit ARM implementation:
00000048 <vm_get_page_prot>:
48: e200000f and r0, r0, #15
4c: e3003000 movw r3, #0
4c: R_ARM_MOVW_ABS_NC .LANCHOR1
50: e3403000 movt r3, #0
50: R_ARM_MOVT_ABS .LANCHOR1
54: e7930100 ldr r0, [r3, r0, lsl #2]
58: e12fff1e bx lr
That is five instructions long.
Please show that your new implementation is not more expensive on
32-bit ARM. Please do so by building a 32-bit kernel, and providing
the disassembly.
I think you will find way more than five instructions in your version -
the compiler will have to issue code to decode the protection bits,
probably using a table of branches or absolute PC values, or possibly
the worst case of using multiple comparisons and branches. It then has
to load constants that may be moved using movw on ARMv7, but on
older architectures would have to be created from multiple instructions
or loaded from the literal pool. Then there'll be instructions to load
the address of "user_pgprot", retrieve its value, and bitwise or that.
Therefore, I fail to see how your approach of getting rid of the table
is somehow "better" than what we currently have in terms of the effect
on the resulting code.
If you don't like the __P and __S stuff and two arch_* hooks, you could
move the table into arch code along with vm_get_page_prot() without the
additional unnecessary hooks, while keeping all the benefits of the
table lookup.
Thanks.
--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!
Le 01/03/2022 à 01:31, Russell King (Oracle) a écrit :
> On Tue, Mar 01, 2022 at 05:30:41AM +0530, Anshuman Khandual wrote:
>> On 2/28/22 4:27 PM, Russell King (Oracle) wrote:
>>> On Mon, Feb 28, 2022 at 04:17:32PM +0530, Anshuman Khandual wrote:
>>>> This defines and exports a platform specific custom vm_get_page_prot() via
>>>> subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
>>>> macros can be dropped which are no longer needed.
>>>
>>> What I would really like to know is why having to run _code_ to work out
>>> what the page protections need to be is better than looking it up in a
>>> table.
>>>
>>> Not only is this more expensive in terms of CPU cycles, it also brings
>>> additional code size with it.
>>>
>>> I'm struggling to see what the benefit is.
>>
>> Currently vm_get_page_prot() is also being _run_ to fetch required page
>> protection values. Although that is being run in the core MM and from a
>> platform perspective __SXXX, __PXXX are just being exported for a table.
>> Looking it up in a table (and applying more constructs there after) is
>> not much different than a clean switch case statement in terms of CPU
>> usage. So this is not more expensive in terms of CPU cycles.
>
> I disagree.
So do I.
>
> However, let's base this disagreement on some evidence. Here is the
> present 32-bit ARM implementation:
>
> 00000048 <vm_get_page_prot>:
> 48: e200000f and r0, r0, #15
> 4c: e3003000 movw r3, #0
> 4c: R_ARM_MOVW_ABS_NC .LANCHOR1
> 50: e3403000 movt r3, #0
> 50: R_ARM_MOVT_ABS .LANCHOR1
> 54: e7930100 ldr r0, [r3, r0, lsl #2]
> 58: e12fff1e bx lr
>
> That is five instructions long.
On ppc32 I get:
00000094 <vm_get_page_prot>:
94: 3d 20 00 00 lis r9,0
96: R_PPC_ADDR16_HA .data..ro_after_init
98: 54 84 16 ba rlwinm r4,r4,2,26,29
9c: 39 29 00 00 addi r9,r9,0
9e: R_PPC_ADDR16_LO .data..ro_after_init
a0: 7d 29 20 2e lwzx r9,r9,r4
a4: 91 23 00 00 stw r9,0(r3)
a8: 4e 80 00 20 blr
>
> Please show that your new implementation is not more expensive on
> 32-bit ARM. Please do so by building a 32-bit kernel, and providing
> the disassembly.
With your series I get:
00000000 <vm_get_page_prot>:
0: 3d 20 00 00 lis r9,0
2: R_PPC_ADDR16_HA .rodata
4: 39 29 00 00 addi r9,r9,0
6: R_PPC_ADDR16_LO .rodata
8: 54 84 16 ba rlwinm r4,r4,2,26,29
c: 7d 49 20 2e lwzx r10,r9,r4
10: 7d 4a 4a 14 add r10,r10,r9
14: 7d 49 03 a6 mtctr r10
18: 4e 80 04 20 bctr
1c: 39 20 03 15 li r9,789
20: 91 23 00 00 stw r9,0(r3)
24: 4e 80 00 20 blr
28: 39 20 01 15 li r9,277
2c: 91 23 00 00 stw r9,0(r3)
30: 4e 80 00 20 blr
34: 39 20 07 15 li r9,1813
38: 91 23 00 00 stw r9,0(r3)
3c: 4e 80 00 20 blr
40: 39 20 05 15 li r9,1301
44: 91 23 00 00 stw r9,0(r3)
48: 4e 80 00 20 blr
4c: 39 20 01 11 li r9,273
50: 4b ff ff d0 b 20 <vm_get_page_prot+0x20>
That is definitely more expensive, it implements a table of branches.
Christophe
On 3/1/22 6:01 AM, Russell King (Oracle) wrote:
> On Tue, Mar 01, 2022 at 05:30:41AM +0530, Anshuman Khandual wrote:
>> On 2/28/22 4:27 PM, Russell King (Oracle) wrote:
>>> On Mon, Feb 28, 2022 at 04:17:32PM +0530, Anshuman Khandual wrote:
>>>> This defines and exports a platform specific custom vm_get_page_prot() via
>>>> subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
>>>> macros can be dropped which are no longer needed.
>>> What I would really like to know is why having to run _code_ to work out
>>> what the page protections need to be is better than looking it up in a
>>> table.
>>>
>>> Not only is this more expensive in terms of CPU cycles, it also brings
>>> additional code size with it.
>>>
>>> I'm struggling to see what the benefit is.
>> Currently vm_get_page_prot() is also being _run_ to fetch required page
>> protection values. Although that is being run in the core MM and from a
>> platform perspective __SXXX, __PXXX are just being exported for a table.
>> Looking it up in a table (and applying more constructs there after) is
>> not much different than a clean switch case statement in terms of CPU
>> usage. So this is not more expensive in terms of CPU cycles.
> I disagree.
>
> However, let's base this disagreement on some evidence. Here is the
> present 32-bit ARM implementation:
>
> 00000048 <vm_get_page_prot>:
> 48: e200000f and r0, r0, #15
> 4c: e3003000 movw r3, #0
> 4c: R_ARM_MOVW_ABS_NC .LANCHOR1
> 50: e3403000 movt r3, #0
> 50: R_ARM_MOVT_ABS .LANCHOR1
> 54: e7930100 ldr r0, [r3, r0, lsl #2]
> 58: e12fff1e bx lr
>
> That is five instructions long.
>
> Please show that your new implementation is not more expensive on
> 32-bit ARM. Please do so by building a 32-bit kernel, and providing
> the disassembly.
>
> I think you will find way more than five instructions in your version -
> the compiler will have to issue code to decode the protection bits,
> probably using a table of branches or absolute PC values, or possibly
> the worst case of using multiple comparisons and branches. It then has
> to load constants that may be moved using movw on ARMv7, but on
> older architectures would have to be created from multiple instructions
> or loaded from the literal pool. Then there'll be instructions to load
> the address of "user_pgprot", retrieve its value, and bitwise or that.
>
> Therefore, I fail to see how your approach of getting rid of the table
> is somehow "better" than what we currently have in terms of the effect
> on the resulting code.
>
> If you don't like the __P and __S stuff and two arch_* hooks, you could
> move the table into arch code along with vm_get_page_prot() without the
> additional unnecessary hooks, while keeping all the benefits of the
> table lookup.
Okay, will change the arm's vm_get_page_prot() implementation as suggested.
On 3/1/22 1:46 PM, Christophe Leroy wrote:
>
>
> Le 01/03/2022 à 01:31, Russell King (Oracle) a écrit :
>> On Tue, Mar 01, 2022 at 05:30:41AM +0530, Anshuman Khandual wrote:
>>> On 2/28/22 4:27 PM, Russell King (Oracle) wrote:
>>>> On Mon, Feb 28, 2022 at 04:17:32PM +0530, Anshuman Khandual wrote:
>>>>> This defines and exports a platform specific custom vm_get_page_prot() via
>>>>> subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
>>>>> macros can be dropped which are no longer needed.
>>>>
>>>> What I would really like to know is why having to run _code_ to work out
>>>> what the page protections need to be is better than looking it up in a
>>>> table.
>>>>
>>>> Not only is this more expensive in terms of CPU cycles, it also brings
>>>> additional code size with it.
>>>>
>>>> I'm struggling to see what the benefit is.
>>>
>>> Currently vm_get_page_prot() is also being _run_ to fetch required page
>>> protection values. Although that is being run in the core MM and from a
>>> platform perspective __SXXX, __PXXX are just being exported for a table.
>>> Looking it up in a table (and applying more constructs there after) is
>>> not much different than a clean switch case statement in terms of CPU
>>> usage. So this is not more expensive in terms of CPU cycles.
>>
>> I disagree.
>
> So do I.
>
>>
>> However, let's base this disagreement on some evidence. Here is the
>> present 32-bit ARM implementation:
>>
>> 00000048 <vm_get_page_prot>:
>> 48: e200000f and r0, r0, #15
>> 4c: e3003000 movw r3, #0
>> 4c: R_ARM_MOVW_ABS_NC .LANCHOR1
>> 50: e3403000 movt r3, #0
>> 50: R_ARM_MOVT_ABS .LANCHOR1
>> 54: e7930100 ldr r0, [r3, r0, lsl #2]
>> 58: e12fff1e bx lr
>>
>> That is five instructions long.
>
> On ppc32 I get:
>
> 00000094 <vm_get_page_prot>:
> 94: 3d 20 00 00 lis r9,0
> 96: R_PPC_ADDR16_HA .data..ro_after_init
> 98: 54 84 16 ba rlwinm r4,r4,2,26,29
> 9c: 39 29 00 00 addi r9,r9,0
> 9e: R_PPC_ADDR16_LO .data..ro_after_init
> a0: 7d 29 20 2e lwzx r9,r9,r4
> a4: 91 23 00 00 stw r9,0(r3)
> a8: 4e 80 00 20 blr
>
>
>>
>> Please show that your new implementation is not more expensive on
>> 32-bit ARM. Please do so by building a 32-bit kernel, and providing
>> the disassembly.
>
> With your series I get:
>
> 00000000 <vm_get_page_prot>:
> 0: 3d 20 00 00 lis r9,0
> 2: R_PPC_ADDR16_HA .rodata
> 4: 39 29 00 00 addi r9,r9,0
> 6: R_PPC_ADDR16_LO .rodata
> 8: 54 84 16 ba rlwinm r4,r4,2,26,29
> c: 7d 49 20 2e lwzx r10,r9,r4
> 10: 7d 4a 4a 14 add r10,r10,r9
> 14: 7d 49 03 a6 mtctr r10
> 18: 4e 80 04 20 bctr
> 1c: 39 20 03 15 li r9,789
> 20: 91 23 00 00 stw r9,0(r3)
> 24: 4e 80 00 20 blr
> 28: 39 20 01 15 li r9,277
> 2c: 91 23 00 00 stw r9,0(r3)
> 30: 4e 80 00 20 blr
> 34: 39 20 07 15 li r9,1813
> 38: 91 23 00 00 stw r9,0(r3)
> 3c: 4e 80 00 20 blr
> 40: 39 20 05 15 li r9,1301
> 44: 91 23 00 00 stw r9,0(r3)
> 48: 4e 80 00 20 blr
> 4c: 39 20 01 11 li r9,273
> 50: 4b ff ff d0 b 20 <vm_get_page_prot+0x20>
>
>
> That is definitely more expensive, it implements a table of branches.
Okay, will split out the PPC32 implementation that retains existing
table look up method. Also planning to keep that inside same file
(arch/powerpc/mm/mmap.c), unless you have a different preference.
On Wed, Mar 02, 2022 at 04:36:52PM +0530, Anshuman Khandual wrote:
> On 3/2/22 3:35 PM, Geert Uytterhoeven wrote:
> > I doubt the switch() variant would give better code on any platform.
> >
> > What about using tables everywhere, using designated initializers
> > to improve readability?
>
> Designated initializers ? Could you please be more specific. A table look
> up on arm platform would be something like this and arm_protection_map[]
> needs to be updated with user_pgprot like before.
There is *absolutely* nothing wrong with that. Updating it once during
boot is way more efficient than having to compute the value each time
vm_get_page_prot() gets called.
Thanks.
--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!
Hi Anshuman,
On Wed, Mar 2, 2022 at 10:51 AM Anshuman Khandual
<[email protected]> wrote:
> On 3/2/22 12:35 PM, Christophe Leroy wrote:
> > Le 02/03/2022 à 04:22, Anshuman Khandual a écrit :
> >> On 3/1/22 1:46 PM, Christophe Leroy wrote:
> >>> Le 01/03/2022 à 01:31, Russell King (Oracle) a écrit :
> >>>> On Tue, Mar 01, 2022 at 05:30:41AM +0530, Anshuman Khandual wrote:
> >>>>> On 2/28/22 4:27 PM, Russell King (Oracle) wrote:
> >>>>>> On Mon, Feb 28, 2022 at 04:17:32PM +0530, Anshuman Khandual wrote:
> >>>>>>> This defines and exports a platform specific custom vm_get_page_prot() via
> >>>>>>> subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
> >>>>>>> macros can be dropped which are no longer needed.
> >>>>>>
> >>>>>> What I would really like to know is why having to run _code_ to work out
> >>>>>> what the page protections need to be is better than looking it up in a
> >>>>>> table.
> >>>>>>
> >>>>>> Not only is this more expensive in terms of CPU cycles, it also brings
> >>>>>> additional code size with it.
> >>>>>>
> >>>>>> I'm struggling to see what the benefit is.
> >>>>>
> >>>>> Currently vm_get_page_prot() is also being _run_ to fetch required page
> >>>>> protection values. Although that is being run in the core MM and from a
> >>>>> platform perspective __SXXX, __PXXX are just being exported for a table.
> >>>>> Looking it up in a table (and applying more constructs there after) is
> >>>>> not much different than a clean switch case statement in terms of CPU
> >>>>> usage. So this is not more expensive in terms of CPU cycles.
> >>>>
> >>>> I disagree.
> >>>
> >>> So do I.
> >>>
> >>>>
> >>>> However, let's base this disagreement on some evidence. Here is the
> >>>> present 32-bit ARM implementation:
> >>>>
> >>>> 00000048 <vm_get_page_prot>:
> >>>> 48: e200000f and r0, r0, #15
> >>>> 4c: e3003000 movw r3, #0
> >>>> 4c: R_ARM_MOVW_ABS_NC .LANCHOR1
> >>>> 50: e3403000 movt r3, #0
> >>>> 50: R_ARM_MOVT_ABS .LANCHOR1
> >>>> 54: e7930100 ldr r0, [r3, r0, lsl #2]
> >>>> 58: e12fff1e bx lr
> >>>>
> >>>> That is five instructions long.
> >>>
> >>> On ppc32 I get:
> >>>
> >>> 00000094 <vm_get_page_prot>:
> >>> 94: 3d 20 00 00 lis r9,0
> >>> 96: R_PPC_ADDR16_HA .data..ro_after_init
> >>> 98: 54 84 16 ba rlwinm r4,r4,2,26,29
> >>> 9c: 39 29 00 00 addi r9,r9,0
> >>> 9e: R_PPC_ADDR16_LO .data..ro_after_init
> >>> a0: 7d 29 20 2e lwzx r9,r9,r4
> >>> a4: 91 23 00 00 stw r9,0(r3)
> >>> a8: 4e 80 00 20 blr
> >>>
> >>>
> >>>>
> >>>> Please show that your new implementation is not more expensive on
> >>>> 32-bit ARM. Please do so by building a 32-bit kernel, and providing
> >>>> the disassembly.
> >>>
> >>> With your series I get:
> >>>
> >>> 00000000 <vm_get_page_prot>:
> >>> 0: 3d 20 00 00 lis r9,0
> >>> 2: R_PPC_ADDR16_HA .rodata
> >>> 4: 39 29 00 00 addi r9,r9,0
> >>> 6: R_PPC_ADDR16_LO .rodata
> >>> 8: 54 84 16 ba rlwinm r4,r4,2,26,29
> >>> c: 7d 49 20 2e lwzx r10,r9,r4
> >>> 10: 7d 4a 4a 14 add r10,r10,r9
> >>> 14: 7d 49 03 a6 mtctr r10
> >>> 18: 4e 80 04 20 bctr
> >>> 1c: 39 20 03 15 li r9,789
> >>> 20: 91 23 00 00 stw r9,0(r3)
> >>> 24: 4e 80 00 20 blr
> >>> 28: 39 20 01 15 li r9,277
> >>> 2c: 91 23 00 00 stw r9,0(r3)
> >>> 30: 4e 80 00 20 blr
> >>> 34: 39 20 07 15 li r9,1813
> >>> 38: 91 23 00 00 stw r9,0(r3)
> >>> 3c: 4e 80 00 20 blr
> >>> 40: 39 20 05 15 li r9,1301
> >>> 44: 91 23 00 00 stw r9,0(r3)
> >>> 48: 4e 80 00 20 blr
> >>> 4c: 39 20 01 11 li r9,273
> >>> 50: 4b ff ff d0 b 20 <vm_get_page_prot+0x20>
> >>>
> >>>
> >>> That is definitely more expensive, it implements a table of branches.
> >>
> >> Okay, will split out the PPC32 implementation that retains existing
> >> table look up method. Also planning to keep that inside same file
> >> (arch/powerpc/mm/mmap.c), unless you have a difference preference.
> >
> > My point was not to get something specific for PPC32, but to amplify on
> > Russell's objection.
> >
> > As this is bad for ARM and bad for PPC32, do we have any evidence that
> > your change is good for any other architecture ?
> >
> > I checked PPC64 and there is exactly the same drawback. With the current
> > implementation it is a small function performing table read then a few
> > adjustment. After your change it is a bigger function implementing a
> > table of branches.
>
> I am wondering if this would not be the case for any other switch case
> statement on the platform ? Is there something specific/different just
> on vm_get_page_prot() implementation ? Are you suggesting that switch
> case statements should just be avoided instead ?
>
> >
> > So, as requested by Russell, could you look at the disassembly for other
> > architectures and show us that ARM and POWERPC are the only ones for
> > which your change is not optimal ?
>
> But the primary purpose of this series is not to guarantee optimized
> code on platform by platform basis, while migrating from a table based
> look up method into a switch case statement.
>
> But instead, the purposes is to remove current levels of unnecessary
> abstraction while converting a vm_flags access combination into page
> protection. The switch case statement for platform implementation of
> vm_get_page_prot() just seemed logical enough. Christoph's original
> suggestion patch for x86 had the same implementation as well.
>
> But if the table look up is still better/preferred method on certain
> platforms like arm or ppc32, will be happy to preserve that.
I doubt the switch() variant would give better code on any platform.
What about using tables everywhere, using designated initializers
to improve readability?
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
On 3/2/22 3:35 PM, Geert Uytterhoeven wrote:
> Hi Anshuman,
>
> On Wed, Mar 2, 2022 at 10:51 AM Anshuman Khandual
> <[email protected]> wrote:
>> On 3/2/22 12:35 PM, Christophe Leroy wrote:
>>> Le 02/03/2022 à 04:22, Anshuman Khandual a écrit :
>>>> On 3/1/22 1:46 PM, Christophe Leroy wrote:
>>>>> Le 01/03/2022 à 01:31, Russell King (Oracle) a écrit :
>>>>>> On Tue, Mar 01, 2022 at 05:30:41AM +0530, Anshuman Khandual wrote:
>>>>>>> On 2/28/22 4:27 PM, Russell King (Oracle) wrote:
>>>>>>>> On Mon, Feb 28, 2022 at 04:17:32PM +0530, Anshuman Khandual wrote:
>>>>>>>>> This defines and exports a platform specific custom vm_get_page_prot() via
>>>>>>>>> subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
>>>>>>>>> macros can be dropped which are no longer needed.
>>>>>>>>
>>>>>>>> What I would really like to know is why having to run _code_ to work out
>>>>>>>> what the page protections need to be is better than looking it up in a
>>>>>>>> table.
>>>>>>>>
>>>>>>>> Not only is this more expensive in terms of CPU cycles, it also brings
>>>>>>>> additional code size with it.
>>>>>>>>
>>>>>>>> I'm struggling to see what the benefit is.
>>>>>>>
>>>>>>> Currently vm_get_page_prot() is also being _run_ to fetch required page
>>>>>>> protection values. Although that is being run in the core MM and from a
>>>>>>> platform perspective __SXXX, __PXXX are just being exported for a table.
>>>>>>> Looking it up in a table (and applying more constructs there after) is
>>>>>>> not much different than a clean switch case statement in terms of CPU
>>>>>>> usage. So this is not more expensive in terms of CPU cycles.
>>>>>>
>>>>>> I disagree.
>>>>>
>>>>> So do I.
>>>>>
>>>>>>
>>>>>> However, let's base this disagreement on some evidence. Here is the
>>>>>> present 32-bit ARM implementation:
>>>>>>
>>>>>> 00000048 <vm_get_page_prot>:
>>>>>> 48: e200000f and r0, r0, #15
>>>>>> 4c: e3003000 movw r3, #0
>>>>>> 4c: R_ARM_MOVW_ABS_NC .LANCHOR1
>>>>>> 50: e3403000 movt r3, #0
>>>>>> 50: R_ARM_MOVT_ABS .LANCHOR1
>>>>>> 54: e7930100 ldr r0, [r3, r0, lsl #2]
>>>>>> 58: e12fff1e bx lr
>>>>>>
>>>>>> That is five instructions long.
>>>>>
>>>>> On ppc32 I get:
>>>>>
>>>>> 00000094 <vm_get_page_prot>:
>>>>> 94: 3d 20 00 00 lis r9,0
>>>>> 96: R_PPC_ADDR16_HA .data..ro_after_init
>>>>> 98: 54 84 16 ba rlwinm r4,r4,2,26,29
>>>>> 9c: 39 29 00 00 addi r9,r9,0
>>>>> 9e: R_PPC_ADDR16_LO .data..ro_after_init
>>>>> a0: 7d 29 20 2e lwzx r9,r9,r4
>>>>> a4: 91 23 00 00 stw r9,0(r3)
>>>>> a8: 4e 80 00 20 blr
>>>>>
>>>>>
>>>>>>
>>>>>> Please show that your new implementation is not more expensive on
>>>>>> 32-bit ARM. Please do so by building a 32-bit kernel, and providing
>>>>>> the disassembly.
>>>>>
>>>>> With your series I get:
>>>>>
>>>>> 00000000 <vm_get_page_prot>:
>>>>> 0: 3d 20 00 00 lis r9,0
>>>>> 2: R_PPC_ADDR16_HA .rodata
>>>>> 4: 39 29 00 00 addi r9,r9,0
>>>>> 6: R_PPC_ADDR16_LO .rodata
>>>>> 8: 54 84 16 ba rlwinm r4,r4,2,26,29
>>>>> c: 7d 49 20 2e lwzx r10,r9,r4
>>>>> 10: 7d 4a 4a 14 add r10,r10,r9
>>>>> 14: 7d 49 03 a6 mtctr r10
>>>>> 18: 4e 80 04 20 bctr
>>>>> 1c: 39 20 03 15 li r9,789
>>>>> 20: 91 23 00 00 stw r9,0(r3)
>>>>> 24: 4e 80 00 20 blr
>>>>> 28: 39 20 01 15 li r9,277
>>>>> 2c: 91 23 00 00 stw r9,0(r3)
>>>>> 30: 4e 80 00 20 blr
>>>>> 34: 39 20 07 15 li r9,1813
>>>>> 38: 91 23 00 00 stw r9,0(r3)
>>>>> 3c: 4e 80 00 20 blr
>>>>> 40: 39 20 05 15 li r9,1301
>>>>> 44: 91 23 00 00 stw r9,0(r3)
>>>>> 48: 4e 80 00 20 blr
>>>>> 4c: 39 20 01 11 li r9,273
>>>>> 50: 4b ff ff d0 b 20 <vm_get_page_prot+0x20>
>>>>>
>>>>>
>>>>> That is definitely more expensive, it implements a table of branches.
>>>>
>>>> Okay, will split out the PPC32 implementation that retains existing
>>>> table look up method. Also planning to keep that inside same file
>>>> (arch/powerpc/mm/mmap.c), unless you have a difference preference.
>>>
>>> My point was not to get something specific for PPC32, but to amplify on
>>> Russell's objection.
>>>
>>> As this is bad for ARM and bad for PPC32, do we have any evidence that
>>> your change is good for any other architecture ?
>>>
>>> I checked PPC64 and there is exactly the same drawback. With the current
>>> implementation it is a small function performing table read then a few
>>> adjustment. After your change it is a bigger function implementing a
>>> table of branches.
>>
>> I am wondering if this would not be the case for any other switch case
>> statement on the platform ? Is there something specific/different just
>> on vm_get_page_prot() implementation ? Are you suggesting that switch
>> case statements should just be avoided instead ?
>>
>>>
>>> So, as requested by Russell, could you look at the disassembly for other
>>> architectures and show us that ARM and POWERPC are the only ones for
>>> which your change is not optimal ?
>>
>> But the primary purpose of this series is not to guarantee optimized
>> code on platform by platform basis, while migrating from a table based
>> look up method into a switch case statement.
>>
>> But instead, the purposes is to remove current levels of unnecessary
>> abstraction while converting a vm_flags access combination into page
>> protection. The switch case statement for platform implementation of
>> vm_get_page_prot() just seemed logical enough. Christoph's original
>> suggestion patch for x86 had the same implementation as well.
>>
>> But if the table look up is still better/preferred method on certain
>> platforms like arm or ppc32, will be happy to preserve that.
>
> I doubt the switch() variant would give better code on any platform.
>
> What about using tables everywhere, using designated initializers
> to improve readability?
Designated initializers ? Could you please be more specific. A table look
up on arm platform would be something like this and arm_protection_map[]
needs to be updated with user_pgprot like before. Just wondering how a
designated initializer will help here.
static pgprot_t arm_protection_map[16] __ro_after_init = {
[VM_NONE] = __PAGE_NONE,
[VM_READ] = __PAGE_READONLY,
[VM_WRITE] = __PAGE_COPY,
[VM_WRITE | VM_READ] = __PAGE_COPY,
[VM_EXEC] = __PAGE_READONLY_EXEC,
[VM_EXEC | VM_READ] = __PAGE_READONLY_EXEC,
[VM_EXEC | VM_WRITE] = __PAGE_COPY_EXEC,
[VM_EXEC | VM_WRITE | VM_READ] = __PAGE_COPY_EXEC,
[VM_SHARED] = __PAGE_NONE,
[VM_SHARED | VM_READ] = __PAGE_READONLY,
[VM_SHARED | VM_WRITE] = __PAGE_SHARED,
[VM_SHARED | VM_WRITE | VM_READ] = __PAGE_SHARED,
[VM_SHARED | VM_EXEC] = __PAGE_READONLY_EXEC,
[VM_SHARED | VM_EXEC | VM_READ] = __PAGE_READONLY_EXEC,
[VM_SHARED | VM_EXEC | VM_WRITE] = __PAGE_SHARED_EXEC,
[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __PAGE_SHARED_EXEC
};
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
return __pgprot(pgprot_val(arm_protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]));
}
EXPORT_SYMBOL(vm_get_page_prot);
Hi Anshuman,
On Wed, Mar 2, 2022 at 12:07 PM Anshuman Khandual
<[email protected]> wrote:
> On 3/2/22 3:35 PM, Geert Uytterhoeven wrote:
> > On Wed, Mar 2, 2022 at 10:51 AM Anshuman Khandual
> > <[email protected]> wrote:
> >> On 3/2/22 12:35 PM, Christophe Leroy wrote:
> >>> Le 02/03/2022 à 04:22, Anshuman Khandual a écrit :
> >>>> On 3/1/22 1:46 PM, Christophe Leroy wrote:
> >>>>> Le 01/03/2022 à 01:31, Russell King (Oracle) a écrit :
> >>>>>> On Tue, Mar 01, 2022 at 05:30:41AM +0530, Anshuman Khandual wrote:
> >>>>>>> On 2/28/22 4:27 PM, Russell King (Oracle) wrote:
> >>>>>>>> On Mon, Feb 28, 2022 at 04:17:32PM +0530, Anshuman Khandual wrote:
> >>>>>>>>> This defines and exports a platform specific custom vm_get_page_prot() via
> >>>>>>>>> subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
> >>>>>>>>> macros can be dropped which are no longer needed.
> >>>>>>>>
> >>>>>>>> What I would really like to know is why having to run _code_ to work out
> >>>>>>>> what the page protections need to be is better than looking it up in a
> >>>>>>>> table.
> >>>>>>>>
> >>>>>>>> Not only is this more expensive in terms of CPU cycles, it also brings
> >>>>>>>> additional code size with it.
> >>>>>>>>
> >>>>>>>> I'm struggling to see what the benefit is.
> >>>>>>>
> >>>>>>> Currently vm_get_page_prot() is also being _run_ to fetch required page
> >>>>>>> protection values. Although that is being run in the core MM and from a
> >>>>>>> platform perspective __SXXX, __PXXX are just being exported for a table.
> >>>>>>> Looking it up in a table (and applying more constructs there after) is
> >>>>>>> not much different than a clean switch case statement in terms of CPU
> >>>>>>> usage. So this is not more expensive in terms of CPU cycles.
> >>>>>>
> >>>>>> I disagree.
> >>>>>
> >>>>> So do I.
> >>>>>
> >>>>>>
> >>>>>> However, let's base this disagreement on some evidence. Here is the
> >>>>>> present 32-bit ARM implementation:
> >>>>>>
> >>>>>> 00000048 <vm_get_page_prot>:
> >>>>>> 48: e200000f and r0, r0, #15
> >>>>>> 4c: e3003000 movw r3, #0
> >>>>>> 4c: R_ARM_MOVW_ABS_NC .LANCHOR1
> >>>>>> 50: e3403000 movt r3, #0
> >>>>>> 50: R_ARM_MOVT_ABS .LANCHOR1
> >>>>>> 54: e7930100 ldr r0, [r3, r0, lsl #2]
> >>>>>> 58: e12fff1e bx lr
> >>>>>>
> >>>>>> That is five instructions long.
> >>>>>
> >>>>> On ppc32 I get:
> >>>>>
> >>>>> 00000094 <vm_get_page_prot>:
> >>>>> 94: 3d 20 00 00 lis r9,0
> >>>>> 96: R_PPC_ADDR16_HA .data..ro_after_init
> >>>>> 98: 54 84 16 ba rlwinm r4,r4,2,26,29
> >>>>> 9c: 39 29 00 00 addi r9,r9,0
> >>>>> 9e: R_PPC_ADDR16_LO .data..ro_after_init
> >>>>> a0: 7d 29 20 2e lwzx r9,r9,r4
> >>>>> a4: 91 23 00 00 stw r9,0(r3)
> >>>>> a8: 4e 80 00 20 blr
> >>>>>
> >>>>>
> >>>>>>
> >>>>>> Please show that your new implementation is not more expensive on
> >>>>>> 32-bit ARM. Please do so by building a 32-bit kernel, and providing
> >>>>>> the disassembly.
> >>>>>
> >>>>> With your series I get:
> >>>>>
> >>>>> 00000000 <vm_get_page_prot>:
> >>>>> 0: 3d 20 00 00 lis r9,0
> >>>>> 2: R_PPC_ADDR16_HA .rodata
> >>>>> 4: 39 29 00 00 addi r9,r9,0
> >>>>> 6: R_PPC_ADDR16_LO .rodata
> >>>>> 8: 54 84 16 ba rlwinm r4,r4,2,26,29
> >>>>> c: 7d 49 20 2e lwzx r10,r9,r4
> >>>>> 10: 7d 4a 4a 14 add r10,r10,r9
> >>>>> 14: 7d 49 03 a6 mtctr r10
> >>>>> 18: 4e 80 04 20 bctr
> >>>>> 1c: 39 20 03 15 li r9,789
> >>>>> 20: 91 23 00 00 stw r9,0(r3)
> >>>>> 24: 4e 80 00 20 blr
> >>>>> 28: 39 20 01 15 li r9,277
> >>>>> 2c: 91 23 00 00 stw r9,0(r3)
> >>>>> 30: 4e 80 00 20 blr
> >>>>> 34: 39 20 07 15 li r9,1813
> >>>>> 38: 91 23 00 00 stw r9,0(r3)
> >>>>> 3c: 4e 80 00 20 blr
> >>>>> 40: 39 20 05 15 li r9,1301
> >>>>> 44: 91 23 00 00 stw r9,0(r3)
> >>>>> 48: 4e 80 00 20 blr
> >>>>> 4c: 39 20 01 11 li r9,273
> >>>>> 50: 4b ff ff d0 b 20 <vm_get_page_prot+0x20>
> >>>>>
> >>>>>
> >>>>> That is definitely more expensive, it implements a table of branches.
> >>>>
> >>>> Okay, will split out the PPC32 implementation that retains existing
> >>>> table look up method. Also planning to keep that inside same file
> >>>> (arch/powerpc/mm/mmap.c), unless you have a different preference.
> >>>
> >>> My point was not to get something specific for PPC32, but to amplify on
> >>> Russell's objection.
> >>>
> >>> As this is bad for ARM and bad for PPC32, do we have any evidence that
> >>> your change is good for any other architecture ?
> >>>
> >>> I checked PPC64 and there is exactly the same drawback. With the current
> >>> implementation it is a small function performing table read then a few
> >>> adjustment. After your change it is a bigger function implementing a
> >>> table of branches.
> >>
> >> I am wondering if this would not be the case for any other switch case
> >> statement on the platform ? Is there something specific/different just
> >> on vm_get_page_prot() implementation ? Are you suggesting that switch
> >> case statements should just be avoided instead ?
> >>
> >>>
> >>> So, as requested by Russell, could you look at the disassembly for other
> >>> architectures and show us that ARM and POWERPC are the only ones for
> >>> which your change is not optimal ?
> >>
> >> But the primary purpose of this series is not to guarantee optimized
> >> code on platform by platform basis, while migrating from a table based
> >> look up method into a switch case statement.
> >>
> >> But instead, the purpose is to remove current levels of unnecessary
> >> abstraction while converting a vm_flags access combination into page
> >> protection. The switch case statement for platform implementation of
> >> vm_get_page_prot() just seemed logical enough. Christoph's original
> >> suggestion patch for x86 had the same implementation as well.
> >>
> >> But if the table look up is still better/preferred method on certain
> >> platforms like arm or ppc32, will be happy to preserve that.
> >
> > I doubt the switch() variant would give better code on any platform.
> >
> > What about using tables everywhere, using designated initializers
> > to improve readability?
>
> Designated initializers ? Could you please be more specific. A table look
> up on arm platform would be something like this and arm_protection_map[]
> needs to be updated with user_pgprot like before. Just wondering how a
> designated initializer will help here.
It's more readable than the original:
pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
>
> static pgprot_t arm_protection_map[16] __ro_after_init = {
> [VM_NONE] = __PAGE_NONE,
> [VM_READ] = __PAGE_READONLY,
> [VM_WRITE] = __PAGE_COPY,
> [VM_WRITE | VM_READ] = __PAGE_COPY,
> [VM_EXEC] = __PAGE_READONLY_EXEC,
> [VM_EXEC | VM_READ] = __PAGE_READONLY_EXEC,
> [VM_EXEC | VM_WRITE] = __PAGE_COPY_EXEC,
> [VM_EXEC | VM_WRITE | VM_READ] = __PAGE_COPY_EXEC,
> [VM_SHARED] = __PAGE_NONE,
> [VM_SHARED | VM_READ] = __PAGE_READONLY,
> [VM_SHARED | VM_WRITE] = __PAGE_SHARED,
> [VM_SHARED | VM_WRITE | VM_READ] = __PAGE_SHARED,
> [VM_SHARED | VM_EXEC] = __PAGE_READONLY_EXEC,
> [VM_SHARED | VM_EXEC | VM_READ] = __PAGE_READONLY_EXEC,
> [VM_SHARED | VM_EXEC | VM_WRITE] = __PAGE_SHARED_EXEC,
> [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __PAGE_SHARED_EXEC
> };
Yeah, like that.
Seems like you already made such a conversion in
https://lore.kernel.org/all/[email protected]/
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
On 3/2/22 12:35 PM, Christophe Leroy wrote:
>
>
> Le 02/03/2022 à 04:22, Anshuman Khandual a écrit :
>>
>>
>> On 3/1/22 1:46 PM, Christophe Leroy wrote:
>>>
>>>
>>> Le 01/03/2022 à 01:31, Russell King (Oracle) a écrit :
>>>> On Tue, Mar 01, 2022 at 05:30:41AM +0530, Anshuman Khandual wrote:
>>>>> On 2/28/22 4:27 PM, Russell King (Oracle) wrote:
>>>>>> On Mon, Feb 28, 2022 at 04:17:32PM +0530, Anshuman Khandual wrote:
>>>>>>> This defines and exports a platform specific custom vm_get_page_prot() via
>>>>>>> subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
>>>>>>> macros can be dropped which are no longer needed.
>>>>>>
>>>>>> What I would really like to know is why having to run _code_ to work out
>>>>>> what the page protections need to be is better than looking it up in a
>>>>>> table.
>>>>>>
>>>>>> Not only is this more expensive in terms of CPU cycles, it also brings
>>>>>> additional code size with it.
>>>>>>
>>>>>> I'm struggling to see what the benefit is.
>>>>>
>>>>> Currently vm_get_page_prot() is also being _run_ to fetch required page
>>>>> protection values. Although that is being run in the core MM and from a
>>>>> platform perspective __SXXX, __PXXX are just being exported for a table.
>>>>> Looking it up in a table (and applying more constructs there after) is
>>>>> not much different than a clean switch case statement in terms of CPU
>>>>> usage. So this is not more expensive in terms of CPU cycles.
>>>>
>>>> I disagree.
>>>
>>> So do I.
>>>
>>>>
>>>> However, let's base this disagreement on some evidence. Here is the
>>>> present 32-bit ARM implementation:
>>>>
>>>> 00000048 <vm_get_page_prot>:
>>>> 48: e200000f and r0, r0, #15
>>>> 4c: e3003000 movw r3, #0
>>>> 4c: R_ARM_MOVW_ABS_NC .LANCHOR1
>>>> 50: e3403000 movt r3, #0
>>>> 50: R_ARM_MOVT_ABS .LANCHOR1
>>>> 54: e7930100 ldr r0, [r3, r0, lsl #2]
>>>> 58: e12fff1e bx lr
>>>>
>>>> That is five instructions long.
>>>
>>> On ppc32 I get:
>>>
>>> 00000094 <vm_get_page_prot>:
>>> 94: 3d 20 00 00 lis r9,0
>>> 96: R_PPC_ADDR16_HA .data..ro_after_init
>>> 98: 54 84 16 ba rlwinm r4,r4,2,26,29
>>> 9c: 39 29 00 00 addi r9,r9,0
>>> 9e: R_PPC_ADDR16_LO .data..ro_after_init
>>> a0: 7d 29 20 2e lwzx r9,r9,r4
>>> a4: 91 23 00 00 stw r9,0(r3)
>>> a8: 4e 80 00 20 blr
>>>
>>>
>>>>
>>>> Please show that your new implementation is not more expensive on
>>>> 32-bit ARM. Please do so by building a 32-bit kernel, and providing
>>>> the disassembly.
>>>
>>> With your series I get:
>>>
>>> 00000000 <vm_get_page_prot>:
>>> 0: 3d 20 00 00 lis r9,0
>>> 2: R_PPC_ADDR16_HA .rodata
>>> 4: 39 29 00 00 addi r9,r9,0
>>> 6: R_PPC_ADDR16_LO .rodata
>>> 8: 54 84 16 ba rlwinm r4,r4,2,26,29
>>> c: 7d 49 20 2e lwzx r10,r9,r4
>>> 10: 7d 4a 4a 14 add r10,r10,r9
>>> 14: 7d 49 03 a6 mtctr r10
>>> 18: 4e 80 04 20 bctr
>>> 1c: 39 20 03 15 li r9,789
>>> 20: 91 23 00 00 stw r9,0(r3)
>>> 24: 4e 80 00 20 blr
>>> 28: 39 20 01 15 li r9,277
>>> 2c: 91 23 00 00 stw r9,0(r3)
>>> 30: 4e 80 00 20 blr
>>> 34: 39 20 07 15 li r9,1813
>>> 38: 91 23 00 00 stw r9,0(r3)
>>> 3c: 4e 80 00 20 blr
>>> 40: 39 20 05 15 li r9,1301
>>> 44: 91 23 00 00 stw r9,0(r3)
>>> 48: 4e 80 00 20 blr
>>> 4c: 39 20 01 11 li r9,273
>>> 50: 4b ff ff d0 b 20 <vm_get_page_prot+0x20>
>>>
>>>
>>> That is definitely more expensive, it implements a table of branches.
>>
>> Okay, will split out the PPC32 implementation that retains existing
>> table look up method. Also planning to keep that inside same file
> (arch/powerpc/mm/mmap.c), unless you have a different preference.
>
> My point was not to get something specific for PPC32, but to amplify on
> Russell's objection.
>
> As this is bad for ARM and bad for PPC32, do we have any evidence that
> your change is good for any other architecture ?
>
> I checked PPC64 and there is exactly the same drawback. With the current
> implementation it is a small function performing table read then a few
> adjustment. After your change it is a bigger function implementing a
> table of branches.
I am wondering if this would not be the case for any other switch case
statement on the platform ? Is there something specific/different just
on vm_get_page_prot() implementation ? Are you suggesting that switch
case statements should just be avoided instead ?
>
> So, as requested by Russell, could you look at the disassembly for other
> architectures and show us that ARM and POWERPC are the only ones for
> which your change is not optimal ?
But the primary purpose of this series is not to guarantee optimized
code on platform by platform basis, while migrating from a table based
look up method into a switch case statement.
But instead, the purpose is to remove current levels of unnecessary
abstraction while converting a vm_flags access combination into page
protection. The switch case statement for platform implementation of
vm_get_page_prot() just seemed logical enough. Christoph's original
suggestion patch for x86 had the same implementation as well.
But if the table look up is still better/preferred method on certain
platforms like arm or ppc32, will be happy to preserve that.
- Anshuman
Le 02/03/2022 à 04:22, Anshuman Khandual a écrit :
>
>
> On 3/1/22 1:46 PM, Christophe Leroy wrote:
>>
>>
>> Le 01/03/2022 à 01:31, Russell King (Oracle) a écrit :
>>> On Tue, Mar 01, 2022 at 05:30:41AM +0530, Anshuman Khandual wrote:
>>>> On 2/28/22 4:27 PM, Russell King (Oracle) wrote:
>>>>> On Mon, Feb 28, 2022 at 04:17:32PM +0530, Anshuman Khandual wrote:
>>>>>> This defines and exports a platform specific custom vm_get_page_prot() via
>>>>>> subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
>>>>>> macros can be dropped which are no longer needed.
>>>>>
>>>>> What I would really like to know is why having to run _code_ to work out
>>>>> what the page protections need to be is better than looking it up in a
>>>>> table.
>>>>>
>>>>> Not only is this more expensive in terms of CPU cycles, it also brings
>>>>> additional code size with it.
>>>>>
>>>>> I'm struggling to see what the benefit is.
>>>>
>>>> Currently vm_get_page_prot() is also being _run_ to fetch required page
>>>> protection values. Although that is being run in the core MM and from a
>>>> platform perspective __SXXX, __PXXX are just being exported for a table.
>>>> Looking it up in a table (and applying more constructs there after) is
>>>> not much different than a clean switch case statement in terms of CPU
>>>> usage. So this is not more expensive in terms of CPU cycles.
>>>
>>> I disagree.
>>
>> So do I.
>>
>>>
>>> However, let's base this disagreement on some evidence. Here is the
>>> present 32-bit ARM implementation:
>>>
>>> 00000048 <vm_get_page_prot>:
>>> 48: e200000f and r0, r0, #15
>>> 4c: e3003000 movw r3, #0
>>> 4c: R_ARM_MOVW_ABS_NC .LANCHOR1
>>> 50: e3403000 movt r3, #0
>>> 50: R_ARM_MOVT_ABS .LANCHOR1
>>> 54: e7930100 ldr r0, [r3, r0, lsl #2]
>>> 58: e12fff1e bx lr
>>>
>>> That is five instructions long.
>>
>> On ppc32 I get:
>>
>> 00000094 <vm_get_page_prot>:
>> 94: 3d 20 00 00 lis r9,0
>> 96: R_PPC_ADDR16_HA .data..ro_after_init
>> 98: 54 84 16 ba rlwinm r4,r4,2,26,29
>> 9c: 39 29 00 00 addi r9,r9,0
>> 9e: R_PPC_ADDR16_LO .data..ro_after_init
>> a0: 7d 29 20 2e lwzx r9,r9,r4
>> a4: 91 23 00 00 stw r9,0(r3)
>> a8: 4e 80 00 20 blr
>>
>>
>>>
>>> Please show that your new implementation is not more expensive on
>>> 32-bit ARM. Please do so by building a 32-bit kernel, and providing
>>> the disassembly.
>>
>> With your series I get:
>>
>> 00000000 <vm_get_page_prot>:
>> 0: 3d 20 00 00 lis r9,0
>> 2: R_PPC_ADDR16_HA .rodata
>> 4: 39 29 00 00 addi r9,r9,0
>> 6: R_PPC_ADDR16_LO .rodata
>> 8: 54 84 16 ba rlwinm r4,r4,2,26,29
>> c: 7d 49 20 2e lwzx r10,r9,r4
>> 10: 7d 4a 4a 14 add r10,r10,r9
>> 14: 7d 49 03 a6 mtctr r10
>> 18: 4e 80 04 20 bctr
>> 1c: 39 20 03 15 li r9,789
>> 20: 91 23 00 00 stw r9,0(r3)
>> 24: 4e 80 00 20 blr
>> 28: 39 20 01 15 li r9,277
>> 2c: 91 23 00 00 stw r9,0(r3)
>> 30: 4e 80 00 20 blr
>> 34: 39 20 07 15 li r9,1813
>> 38: 91 23 00 00 stw r9,0(r3)
>> 3c: 4e 80 00 20 blr
>> 40: 39 20 05 15 li r9,1301
>> 44: 91 23 00 00 stw r9,0(r3)
>> 48: 4e 80 00 20 blr
>> 4c: 39 20 01 11 li r9,273
>> 50: 4b ff ff d0 b 20 <vm_get_page_prot+0x20>
>>
>>
>> That is definitely more expensive, it implements a table of branches.
>
> Okay, will split out the PPC32 implementation that retains existing
> table look up method. Also planning to keep that inside same file
> (arch/powerpc/mm/mmap.c), unless you have a different preference.
My point was not to get something specific for PPC32, but to amplify on
Russell's objection.
As this is bad for ARM and bad for PPC32, do we have any evidence that
your change is good for any other architecture ?
I checked PPC64 and there is exactly the same drawback. With the current
implementation it is a small function performing table read then a few
adjustment. After your change it is a bigger function implementing a
table of branches.
So, as requested by Russell, could you look at the disassembly for other
architectures and show us that ARM and POWERPC are the only ones for
which your change is not optimal ?
See below the difference for POWERPC64.
Current implementation:
0000000000000a60 <.vm_get_page_prot>:
a60: 3d 42 00 00 addis r10,r2,0
a62: R_PPC64_TOC16_HA .data..ro_after_init
a64: 78 89 1e 68 rldic r9,r4,3,57
a68: 39 4a 00 00 addi r10,r10,0
a6a: R_PPC64_TOC16_LO .data..ro_after_init
a6c: 74 88 01 00 andis. r8,r4,256
a70: 7d 2a 48 2a ldx r9,r10,r9
a74: 41 82 00 1c beq a90 <.vm_get_page_prot+0x30>
a78: 60 00 00 00 nop
a7c: 60 00 00 00 nop
a80: 48 00 00 18 b a98 <.vm_get_page_prot+0x38>
a84: 60 00 00 00 nop
a88: 60 00 00 00 nop
a8c: 60 00 00 00 nop
a90: 60 00 00 00 nop
a94: 60 00 00 00 nop
a98: 0f e0 00 00 twui r0,0
a9c: 60 00 00 00 nop
aa0: 38 80 00 10 li r4,16
aa4: 7d 29 23 78 or r9,r9,r4
aa8: f9 23 00 00 std r9,0(r3)
aac: 4e 80 00 20 blr
ab0: 78 84 d9 04 rldicr r4,r4,27,4
ab4: 78 84 e8 c2 rldicl r4,r4,61,3
ab8: 60 84 00 10 ori r4,r4,16
abc: 4b ff ff e8 b aa4 <.vm_get_page_prot+0x44>
ac0: 78 84 d9 04 rldicr r4,r4,27,4
ac4: 78 84 e8 c2 rldicl r4,r4,61,3
ac8: 4b ff ff dc b aa4 <.vm_get_page_prot+0x44>
With your series:
00000000000005b0 <.vm_get_page_prot>:
5b0: 3d 22 00 00 addis r9,r2,0
5b2: R_PPC64_TOC16_HA .toc+0x10
5b4: e9 49 00 00 ld r10,0(r9)
5b6: R_PPC64_TOC16_LO_DS .toc+0x10
5b8: 78 89 16 a8 rldic r9,r4,2,58
5bc: 7d 2a 4a aa lwax r9,r10,r9
5c0: 7d 29 52 14 add r9,r9,r10
5c4: 7d 29 03 a6 mtctr r9
5c8: 3d 20 80 00 lis r9,-32768
5cc: 79 29 07 c6 rldicr r9,r9,32,31
5d0: 4e 80 04 20 bctr
5d4: 00 00 00 ec .long 0xec
5d8: 00 00 00 6c .long 0x6c
5dc: 00 00 00 6c .long 0x6c
5e0: 00 00 00 6c .long 0x6c
5e4: 00 00 00 4c .long 0x4c
5e8: 00 00 00 4c .long 0x4c
5ec: 00 00 00 4c .long 0x4c
5f0: 00 00 00 4c .long 0x4c
5f4: 00 00 00 ec .long 0xec
5f8: 00 00 00 6c .long 0x6c
5fc: 00 00 00 cc .long 0xcc
600: 00 00 00 cc .long 0xcc
604: 00 00 00 4c .long 0x4c
608: 00 00 00 4c .long 0x4c
60c: 00 00 00 dc .long 0xdc
610: 00 00 00 dc .long 0xdc
614: 60 00 00 00 nop
618: 60 00 00 00 nop
61c: 60 00 00 00 nop
620: 61 29 01 05 ori r9,r9,261
624: 74 8a 01 00 andis. r10,r4,256
628: 41 82 00 24 beq 64c <.vm_get_page_prot+0x9c>
62c: 60 00 00 00 nop
630: 60 00 00 00 nop
634: 0f e0 00 00 twui r0,0
638: 60 00 00 00 nop
63c: 60 00 00 00 nop
640: 74 8a 01 00 andis. r10,r4,256
644: 61 29 01 04 ori r9,r9,260
648: 40 82 ff e4 bne 62c <.vm_get_page_prot+0x7c>
64c: 60 00 00 00 nop
650: 60 00 00 00 nop
654: 4b ff ff e0 b 634 <.vm_get_page_prot+0x84>
658: 60 00 00 00 nop
65c: 60 00 00 00 nop
660: 78 84 d9 04 rldicr r4,r4,27,4
664: 78 84 e8 c2 rldicl r4,r4,61,3
668: 7d 29 23 78 or r9,r9,r4
66c: f9 23 00 00 std r9,0(r3)
670: 4e 80 00 20 blr
674: 60 00 00 00 nop
678: 60 00 00 00 nop
67c: 60 00 00 00 nop
680: 38 80 00 10 li r4,16
684: 4b ff ff e4 b 668 <.vm_get_page_prot+0xb8>
688: 60 00 00 00 nop
68c: 60 00 00 00 nop
690: 78 84 d9 04 rldicr r4,r4,27,4
694: 78 84 e8 c2 rldicl r4,r4,61,3
698: 60 84 00 10 ori r4,r4,16
69c: 4b ff ff cc b 668 <.vm_get_page_prot+0xb8>
6a0: 61 29 01 06 ori r9,r9,262
6a4: 4b ff ff 80 b 624 <.vm_get_page_prot+0x74>
6a8: 60 00 00 00 nop
6ac: 60 00 00 00 nop
6b0: 61 29 01 07 ori r9,r9,263
6b4: 4b ff ff 70 b 624 <.vm_get_page_prot+0x74>
6b8: 60 00 00 00 nop
6bc: 60 00 00 00 nop
6c0: 61 29 01 08 ori r9,r9,264
6c4: 4b ff ff 60 b 624 <.vm_get_page_prot+0x74>
Thanks
Christophe
On 3/2/22 16:44, Geert Uytterhoeven wrote:
> Hi Anshuman,
>
> On Wed, Mar 2, 2022 at 12:07 PM Anshuman Khandual
> <[email protected]> wrote:
>> On 3/2/22 3:35 PM, Geert Uytterhoeven wrote:
>>> On Wed, Mar 2, 2022 at 10:51 AM Anshuman Khandual
>>> <[email protected]> wrote:
>>>> On 3/2/22 12:35 PM, Christophe Leroy wrote:
>>>>> Le 02/03/2022 à 04:22, Anshuman Khandual a écrit :
>>>>>> On 3/1/22 1:46 PM, Christophe Leroy wrote:
>>>>>>> Le 01/03/2022 à 01:31, Russell King (Oracle) a écrit :
>>>>>>>> On Tue, Mar 01, 2022 at 05:30:41AM +0530, Anshuman Khandual wrote:
>>>>>>>>> On 2/28/22 4:27 PM, Russell King (Oracle) wrote:
>>>>>>>>>> On Mon, Feb 28, 2022 at 04:17:32PM +0530, Anshuman Khandual wrote:
>>>>>>>>>>> This defines and exports a platform specific custom vm_get_page_prot() via
>>>>>>>>>>> subscribing ARCH_HAS_VM_GET_PAGE_PROT. Subsequently all __SXXX and __PXXX
>>>>>>>>>>> macros can be dropped which are no longer needed.
>>>>>>>>>>
>>>>>>>>>> What I would really like to know is why having to run _code_ to work out
>>>>>>>>>> what the page protections need to be is better than looking it up in a
>>>>>>>>>> table.
>>>>>>>>>>
>>>>>>>>>> Not only is this more expensive in terms of CPU cycles, it also brings
>>>>>>>>>> additional code size with it.
>>>>>>>>>>
>>>>>>>>>> I'm struggling to see what the benefit is.
>>>>>>>>>
>>>>>>>>> Currently vm_get_page_prot() is also being _run_ to fetch required page
>>>>>>>>> protection values. Although that is being run in the core MM and from a
>>>>>>>>> platform perspective __SXXX, __PXXX are just being exported for a table.
>>>>>>>>> Looking it up in a table (and applying more constructs there after) is
>>>>>>>>> not much different than a clean switch case statement in terms of CPU
>>>>>>>>> usage. So this is not more expensive in terms of CPU cycles.
>>>>>>>>
>>>>>>>> I disagree.
>>>>>>>
>>>>>>> So do I.
>>>>>>>
>>>>>>>>
>>>>>>>> However, let's base this disagreement on some evidence. Here is the
>>>>>>>> present 32-bit ARM implementation:
>>>>>>>>
>>>>>>>> 00000048 <vm_get_page_prot>:
>>>>>>>> 48: e200000f and r0, r0, #15
>>>>>>>> 4c: e3003000 movw r3, #0
>>>>>>>> 4c: R_ARM_MOVW_ABS_NC .LANCHOR1
>>>>>>>> 50: e3403000 movt r3, #0
>>>>>>>> 50: R_ARM_MOVT_ABS .LANCHOR1
>>>>>>>> 54: e7930100 ldr r0, [r3, r0, lsl #2]
>>>>>>>> 58: e12fff1e bx lr
>>>>>>>>
>>>>>>>> That is five instructions long.
>>>>>>>
>>>>>>> On ppc32 I get:
>>>>>>>
>>>>>>> 00000094 <vm_get_page_prot>:
>>>>>>> 94: 3d 20 00 00 lis r9,0
>>>>>>> 96: R_PPC_ADDR16_HA .data..ro_after_init
>>>>>>> 98: 54 84 16 ba rlwinm r4,r4,2,26,29
>>>>>>> 9c: 39 29 00 00 addi r9,r9,0
>>>>>>> 9e: R_PPC_ADDR16_LO .data..ro_after_init
>>>>>>> a0: 7d 29 20 2e lwzx r9,r9,r4
>>>>>>> a4: 91 23 00 00 stw r9,0(r3)
>>>>>>> a8: 4e 80 00 20 blr
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Please show that your new implementation is not more expensive on
>>>>>>>> 32-bit ARM. Please do so by building a 32-bit kernel, and providing
>>>>>>>> the disassembly.
>>>>>>>
>>>>>>> With your series I get:
>>>>>>>
>>>>>>> 00000000 <vm_get_page_prot>:
>>>>>>> 0: 3d 20 00 00 lis r9,0
>>>>>>> 2: R_PPC_ADDR16_HA .rodata
>>>>>>> 4: 39 29 00 00 addi r9,r9,0
>>>>>>> 6: R_PPC_ADDR16_LO .rodata
>>>>>>> 8: 54 84 16 ba rlwinm r4,r4,2,26,29
>>>>>>> c: 7d 49 20 2e lwzx r10,r9,r4
>>>>>>> 10: 7d 4a 4a 14 add r10,r10,r9
>>>>>>> 14: 7d 49 03 a6 mtctr r10
>>>>>>> 18: 4e 80 04 20 bctr
>>>>>>> 1c: 39 20 03 15 li r9,789
>>>>>>> 20: 91 23 00 00 stw r9,0(r3)
>>>>>>> 24: 4e 80 00 20 blr
>>>>>>> 28: 39 20 01 15 li r9,277
>>>>>>> 2c: 91 23 00 00 stw r9,0(r3)
>>>>>>> 30: 4e 80 00 20 blr
>>>>>>> 34: 39 20 07 15 li r9,1813
>>>>>>> 38: 91 23 00 00 stw r9,0(r3)
>>>>>>> 3c: 4e 80 00 20 blr
>>>>>>> 40: 39 20 05 15 li r9,1301
>>>>>>> 44: 91 23 00 00 stw r9,0(r3)
>>>>>>> 48: 4e 80 00 20 blr
>>>>>>> 4c: 39 20 01 11 li r9,273
>>>>>>> 50: 4b ff ff d0 b 20 <vm_get_page_prot+0x20>
>>>>>>>
>>>>>>>
>>>>>>> That is definitely more expensive, it implements a table of branches.
>>>>>>
>>>>>> Okay, will split out the PPC32 implementation that retains existing
>>>>>> table look up method. Also planning to keep that inside same file
>>>>>> (arch/powerpc/mm/mmap.c), unless you have a different preference.
>>>>>
>>>>> My point was not to get something specific for PPC32, but to amplify on
>>>>> Russell's objection.
>>>>>
>>>>> As this is bad for ARM and bad for PPC32, do we have any evidence that
>>>>> your change is good for any other architecture ?
>>>>>
>>>>> I checked PPC64 and there is exactly the same drawback. With the current
>>>>> implementation it is a small function performing table read then a few
>>>>> adjustment. After your change it is a bigger function implementing a
>>>>> table of branches.
>>>>
>>>> I am wondering if this would not be the case for any other switch case
>>>> statement on the platform ? Is there something specific/different just
>>>> on vm_get_page_prot() implementation ? Are you suggesting that switch
>>>> case statements should just be avoided instead ?
>>>>
>>>>>
>>>>> So, as requested by Russell, could you look at the disassembly for other
>>>>> architectures and show us that ARM and POWERPC are the only ones for
>>>>> which your change is not optimal ?
>>>>
>>>> But the primary purpose of this series is not to guarantee optimized
>>>> code on platform by platform basis, while migrating from a table based
>>>> look up method into a switch case statement.
>>>>
>>>> But instead, the purpose is to remove current levels of unnecessary
>>>> abstraction while converting a vm_flags access combination into page
>>>> protection. The switch case statement for platform implementation of
>>>> vm_get_page_prot() just seemed logical enough. Christoph's original
>>>> suggestion patch for x86 had the same implementation as well.
>>>>
>>>> But if the table look up is still better/preferred method on certain
>>>> platforms like arm or ppc32, will be happy to preserve that.
>>>
>>> I doubt the switch() variant would give better code on any platform.
>>>
>>> What about using tables everywhere, using designated initializers
>>> to improve readability?
>>
>> Designated initializers ? Could you please be more specific. A table look
>> up on arm platform would be something like this and arm_protection_map[]
>> needs to be updated with user_pgprot like before. Just wondering how a
>> designated initializer will help here.
>
> It's more readable than the original:
>
> pgprot_t protection_map[16] __ro_after_init = {
> __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
> __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
> };
>
>>
>> static pgprot_t arm_protection_map[16] __ro_after_init = {
>> [VM_NONE] = __PAGE_NONE,
>> [VM_READ] = __PAGE_READONLY,
>> [VM_WRITE] = __PAGE_COPY,
>> [VM_WRITE | VM_READ] = __PAGE_COPY,
>> [VM_EXEC] = __PAGE_READONLY_EXEC,
>> [VM_EXEC | VM_READ] = __PAGE_READONLY_EXEC,
>> [VM_EXEC | VM_WRITE] = __PAGE_COPY_EXEC,
>> [VM_EXEC | VM_WRITE | VM_READ] = __PAGE_COPY_EXEC,
>> [VM_SHARED] = __PAGE_NONE,
>> [VM_SHARED | VM_READ] = __PAGE_READONLY,
>> [VM_SHARED | VM_WRITE] = __PAGE_SHARED,
>> [VM_SHARED | VM_WRITE | VM_READ] = __PAGE_SHARED,
>> [VM_SHARED | VM_EXEC] = __PAGE_READONLY_EXEC,
>> [VM_SHARED | VM_EXEC | VM_READ] = __PAGE_READONLY_EXEC,
>> [VM_SHARED | VM_EXEC | VM_WRITE] = __PAGE_SHARED_EXEC,
>> [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __PAGE_SHARED_EXEC
>> };
>
> Yeah, like that.
>
> Seems like you already made such a conversion in
> https://lore.kernel.org/all/[email protected]/
Will rework the series in two different phases as mentioned on the other thread.