With Highmem support, the kernel can map more than 1GB physical memory.
This patchset implements Highmem for RV32, based mostly on the nds32 implementation
and on others such as arm and mips. It has been tested on the Andes A25MP platform.
Eric Lin (3):
riscv/mm: Add pkmap region and CONFIG_HIGHMEM
riscv/mm: Implement kmap() and kmap_atomic()
riscv/mm: Add pkmap in print_vm_layout()
arch/riscv/Kconfig | 18 +++++++
arch/riscv/include/asm/fixmap.h | 9 +++-
arch/riscv/include/asm/highmem.h | 49 +++++++++++++++++
arch/riscv/include/asm/pgtable.h | 27 ++++++++++
arch/riscv/mm/Makefile | 1 +
arch/riscv/mm/highmem.c | 74 +++++++++++++++++++++++++
arch/riscv/mm/init.c | 92 ++++++++++++++++++++++++++++++--
7 files changed, 266 insertions(+), 4 deletions(-)
create mode 100644 arch/riscv/include/asm/highmem.h
create mode 100644 arch/riscv/mm/highmem.c
--
2.17.0
When CONFIG_HIGHMEM is enabled, lowmem is placed before the pkmap
region and the memory layout looks like this:
Virtual kernel memory layout:
lowmem : 0xc0000000 - 0xf5400000 ( 852 MB)
pkmap : 0xf5600000 - 0xf5800000 ( 2 MB)
fixmap : 0xf5800000 - 0xf5c00000 (4096 kB)
pci io : 0xf5c00000 - 0xf6c00000 ( 16 MB)
vmemmap : 0xf6c00000 - 0xf7bfffff ( 15 MB)
vmalloc : 0xf7c00000 - 0xffc00000 ( 128 MB)
Signed-off-by: Eric Lin <[email protected]>
Cc: Alan Kao <[email protected]>
---
arch/riscv/mm/init.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 59afb479176a..b32d558e3f99 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -80,6 +80,12 @@ static inline void print_mlm(char *name, unsigned long b, unsigned long t)
static void print_vm_layout(void)
{
pr_notice("Virtual kernel memory layout:\n");
+#ifdef CONFIG_HIGHMEM
+ print_mlm("lowmem", (unsigned long)PAGE_OFFSET,
+ (unsigned long)high_memory);
+ print_mlm("pkmap", (unsigned long)PKMAP_BASE,
+ (unsigned long)FIXADDR_START);
+#endif
print_mlk("fixmap", (unsigned long)FIXADDR_START,
(unsigned long)FIXADDR_TOP);
print_mlm("pci io", (unsigned long)PCI_IO_START,
@@ -88,8 +94,10 @@ static void print_vm_layout(void)
(unsigned long)VMEMMAP_END);
print_mlm("vmalloc", (unsigned long)VMALLOC_START,
(unsigned long)VMALLOC_END);
+#ifndef CONFIG_HIGHMEM
print_mlm("lowmem", (unsigned long)PAGE_OFFSET,
(unsigned long)high_memory);
+#endif
}
#else
static void print_vm_layout(void) { }
--
2.17.0
Both kmap() and kmap_atomic() help kernel to create
temporary mappings from a highmem page.
Be aware that kmap() might put the calling function to sleep, so
it cannot be used in interrupt context. kmap_atomic() is an
atomic version of kmap() which can be used in interrupt context,
and it is faster than kmap() because it doesn't hold a lock.
Here we reserve some slots in the fixmap region for
kmap_atomic(), while kmap() uses the pkmap region.
Signed-off-by: Eric Lin <[email protected]>
Cc: Alan Kao <[email protected]>
---
arch/riscv/include/asm/fixmap.h | 9 +++-
arch/riscv/include/asm/highmem.h | 30 +++++++++++++
arch/riscv/include/asm/pgtable.h | 5 +++
arch/riscv/mm/Makefile | 1 +
arch/riscv/mm/highmem.c | 74 ++++++++++++++++++++++++++++++++
5 files changed, 118 insertions(+), 1 deletion(-)
create mode 100644 arch/riscv/mm/highmem.c
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index 42d2c42f3cc9..8dedc2bf2917 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2020 Andes Technology Corporation
*/
#ifndef _ASM_RISCV_FIXMAP_H
@@ -10,6 +11,7 @@
#include <linux/sizes.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/kmap_types.h>
#ifdef CONFIG_MMU
/*
@@ -28,7 +30,12 @@ enum fixed_addresses {
FIX_PTE,
FIX_PMD,
FIX_EARLYCON_MEM_BASE,
- __end_of_fixed_addresses
+#ifdef CONFIG_HIGHMEM
+ FIX_KMAP_RESERVED,
+ FIX_KMAP_BEGIN,
+ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS),
+#endif
+ __end_of_fixed_addresses,
};
#define FIXMAP_PAGE_IO PAGE_KERNEL
diff --git a/arch/riscv/include/asm/highmem.h b/arch/riscv/include/asm/highmem.h
index 7fc79e58f607..ec7c83d55830 100644
--- a/arch/riscv/include/asm/highmem.h
+++ b/arch/riscv/include/asm/highmem.h
@@ -17,3 +17,33 @@
#define PKMAP_NR(virt) (((virt) - (PKMAP_BASE)) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
#define kmap_prot PAGE_KERNEL
+
+static inline void flush_cache_kmaps(void)
+{
+ flush_cache_all();
+}
+
+/* Declarations for highmem.c */
+extern unsigned long highstart_pfn, highend_pfn;
+
+extern pte_t *pkmap_page_table;
+
+extern void *kmap_high(struct page *page);
+extern void kunmap_high(struct page *page);
+
+extern void kmap_init(void);
+
+/*
+ * The following functions are already defined by <linux/highmem.h>
+ * when CONFIG_HIGHMEM is not set.
+ */
+#ifdef CONFIG_HIGHMEM
+extern void *kmap(struct page *page);
+extern void kunmap(struct page *page);
+extern void *kmap_atomic(struct page *page);
+extern void __kunmap_atomic(void *kvaddr);
+extern void *kmap_atomic_pfn(unsigned long pfn);
+extern struct page *kmap_atomic_to_page(void *ptr);
+#endif
+
+#endif
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index d9a3769f1f4e..1a774d5a8bbc 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -200,6 +200,11 @@ static inline pgd_t *pgd_offset(const struct mm_struct *mm, unsigned long addr)
/* Locate an entry in the kernel page global directory */
#define pgd_offset_k(addr) pgd_offset(&init_mm, (addr))
+#ifdef CONFIG_HIGHMEM
+/* Locate an entry in the second-level page table */
+#define pmd_off_k(addr) pmd_offset((pud_t *)pgd_offset_k(addr), addr)
+#endif
+
static inline struct page *pmd_page(pmd_t pmd)
{
return pfn_to_page(pmd_val(pmd) >> _PAGE_PFN_SHIFT);
diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile
index 50b7af58c566..6f9305afc632 100644
--- a/arch/riscv/mm/Makefile
+++ b/arch/riscv/mm/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_SMP) += tlbflush.o
endif
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_KASAN) += kasan_init.o
+obj-$(CONFIG_HIGHMEM) += highmem.o
ifdef CONFIG_KASAN
KASAN_SANITIZE_kasan_init.o := n
diff --git a/arch/riscv/mm/highmem.c b/arch/riscv/mm/highmem.c
new file mode 100644
index 000000000000..b01ebe34619e
--- /dev/null
+++ b/arch/riscv/mm/highmem.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2005-2020 Andes Technology Corporation
+ */
+
+#include <linux/export.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+void *kmap(struct page *page)
+{
+ unsigned long vaddr;
+
+ might_sleep();
+ if (!PageHighMem(page))
+ return page_address(page);
+ vaddr = (unsigned long)kmap_high(page);
+ return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap);
+
+void kunmap(struct page *page)
+{
+ BUG_ON(in_interrupt());
+ if (!PageHighMem(page))
+ return;
+ kunmap_high(page);
+}
+EXPORT_SYMBOL(kunmap);
+
+void *kmap_atomic(struct page *page)
+{
+ unsigned int idx;
+ unsigned long vaddr;
+ int type;
+ pte_t *ptep;
+
+ preempt_disable();
+ pagefault_disable();
+
+ if (!PageHighMem(page))
+ return page_address(page);
+
+ type = kmap_atomic_idx_push();
+
+ idx = type + KM_TYPE_NR * smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+
+ ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr);
+ set_pte(ptep, mk_pte(page, kmap_prot));
+
+ return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic);
+
+void __kunmap_atomic(void *kvaddr)
+{
+ if (kvaddr >= (void *)FIXADDR_START && kvaddr < (void *)FIXADDR_TOP) {
+ unsigned long vaddr = (unsigned long)kvaddr;
+ pte_t *ptep;
+
+ kmap_atomic_idx_pop();
+ ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr);
+ set_pte(ptep, __pte(0));
+ }
+ pagefault_enable();
+ preempt_enable();
+}
+EXPORT_SYMBOL(__kunmap_atomic);
--
2.17.0
On Tue, Mar 31, 2020 at 11:34 AM Eric Lin <[email protected]> wrote:
>
> With Highmem support, the kernel can map more than 1GB physical memory.
>
> This patchset implements Highmem for RV32, referencing to mostly nds32
> and others like arm and mips, and it has been tested on Andes A25MP platform.
I would much prefer to not see highmem added to new architectures at all
if possible, see https://lwn.net/Articles/813201/ for some background.
For the arm32 architecture, we are thinking about implementing a
VMSPLIT_4G_4G option to replace highmem in the long run. The most
likely way this would turn out at the moment looks like:
- have a 256MB region for vmalloc space at the top of the 4GB address
space, containing vmlinux, module, mmio mappings and vmalloc
allocations
- have 3.75GB starting at address zero for either user space or the
linear map.
- reserve one address space ID for kernel mappings to avoid tlb flushes
during normal context switches
- On any kernel entry, switch the page table to the one with the linear
mapping, and back to the user page table before returning to user space
- add a generic copy_from_user/copy_to_user implementation based
on get_user_pages() in asm-generic/uaccess.h, using memcpy()
to copy from/to the page in the linear map.
- possibly have architectures override get_user/put_user to use a
cheaper access based on a page table switch to read individual
words if that is cheaper than get_user_pages().
There was an implementation of this for x86 a long time ago, but
it never got merged, mainly because there were no ASIDs on x86
at the time and the TLB flushing during context switch was really
expensive. As far as I can tell, all of the modern embedded cores
do have ASIDs, and unlike x86, most do not support more than 4GB
of physical RAM, so this scheme can work to replace highmem
in most of the remaining cases, and provide additional benefits
(larger user address range, better separation of kernel/user addresses)
at a relatively small performance cost.
Arnd
On Thu, Apr 02, 2020 at 11:31:37AM +0200, Arnd Bergmann wrote:
> On Tue, Mar 31, 2020 at 11:34 AM Eric Lin <[email protected]> wrote:
> >
> > With Highmem support, the kernel can map more than 1GB physical memory.
> >
> > This patchset implements Highmem for RV32, referencing to mostly nds32
> > and others like arm and mips, and it has been tested on Andes A25MP platform.
>
> I would much prefer to not see highmem added to new architectures at all
> if possible, see https://lwn.net/Articles/813201/ for some background.
>
Understood.
> For the arm32 architecture, we are thinking about implementing a
> VMPLIT_4G_4G option to replace highmem in the long run. The most
> likely way this would turn out at the moment looks like:
>
Thanks for sharing the status from ARM32. Is there any available branch
already? It would be good to have a reference implementation.
> - have a 256MB region for vmalloc space at the top of the 4GB address
> space, containing vmlinux, module, mmio mappings and vmalloc
> allocations
>
> - have 3.75GB starting at address zero for either user space or the
> linear map.
>
> - reserve one address space ID for kernel mappings to avoid tlb flushes
> during normal context switches
>
> - On any kernel entry, switch the page table to the one with the linear
> mapping, and back to the user page table before returning to user space
>
After some searching I found a previous discussion
(https://lkml.org/lkml/2019/4/24/2110). The 5.2-based patch ended up not
being merged. But at least we will have something to start from if we want to.
Also interestingly, there was a PR for privileged spec that separates
addressing modes (https://github.com/riscv/riscv-isa-manual/pull/128) as
Sdas extension, but there was no progress afterwards.
Not very related to this thread, but there was some discussion about
ASID design in RISC-V (https://github.com/riscv/riscv-isa-manual/issues/348).
It is now in the ratified 1.11 privileged spec.
> - add a generic copy_from_user/copy_to_user implementation based
> on get_user_pages() in asm-generic/uaccess.h, using memcpy()
> to copy from/to the page in the linear map.
>
> - possible have architectures override get_user/put_user to use a
> cheaper access based on a page table switch to read individual
> words if that is cheaper than get_user_pages().
>
> There was an implementation of this for x86 a long time ago, but
> it never got merged, mainly because there were no ASIDs on x86
> at the time and the TLB flushing during context switch were really
> expensive. As far as I can tell, all of the modern embedded cores
> do have ASIDs, and unlike x86, most do not support more than 4GB
> of physical RAM, so this scheme can work to replace highmem
> in most of the remaining cases, and provide additional benefits
> (larger user address range, higher separate of kernel/user addresses)
> at a relatively small performance cost.
>
> Arnd
>
It seems to me that VMSPLIT_4G_4G is quite different from other VMSPLITs,
because it requires many more changes.
Thanks for showing the stance of the kernel community against HIGHMEM support.
The cited discussion thread is comprehensive and clear. Even though RV32
users cannot get upstream support for their large memory, mechanisms like
VMSPLIT_4G_4G seem to be a promising way to go. That being said, to
support the theoretical 16G physical memory, eventually kmap* will still
be needed.
Alan
On Wed, Apr 8, 2020 at 5:52 AM Alan Kao <[email protected]> wrote:
> On Thu, Apr 02, 2020 at 11:31:37AM +0200, Arnd Bergmann wrote:
> > On Tue, Mar 31, 2020 at 11:34 AM Eric Lin <[email protected]> wrote:
> > For the arm32 architecture, we are thinking about implementing a
> > VMPLIT_4G_4G option to replace highmem in the long run. The most
> > likely way this would turn out at the moment looks like:
> >
>
> Thanks for sharing the status from ARM32. Is there any available branch
> already? It would be good to have a reference implementation.
No code yet, so far not much more than the ideas that I listed. We
are currently looking for someone interested in doing the work
or maybe sponsoring it if they have a strong interest.
If someone does it for RISC-V first, that would of course also help on ARM ;-)
> > - have a 256MB region for vmalloc space at the top of the 4GB address
> > space, containing vmlinux, module, mmio mappings and vmalloc
> > allocations
> >
> > - have 3.75GB starting at address zero for either user space or the
> > linear map.
> >
> > - reserve one address space ID for kernel mappings to avoid tlb flushes
> > during normal context switches
> >
> > - On any kernel entry, switch the page table to the one with the linear
> > mapping, and back to the user page table before returning to user space
> >
>
> After some survey I found previous disccusion
> (https://lkml.org/lkml/2019/4/24/2110). The 5.2-based patch ended up not
> being merged. But at least we will have something to start if we want to.
Ah, I see. What is the current requirement for ASIDs in hardware
implementations? If support for more than one address space is
optional, that would make the VMSPLIT_4G support fairly expensive
as it requires a full TLB flush for each context switch.
> Also interestingly, there was a PR for privileged spec that separates
> addressing modes (https://github.com/riscv/riscv-isa-manual/pull/128) as
> Sdas extension, but there was no progress afterwards.
Right, this sounds like the ideal implementation. This is what is done
in arch/s390 and probably a few of the others.
> Not very related to this thread, but there were some discussion about
> ASID design in RISC-V (https://github.com/riscv/riscv-isa-manual/issues/348).
> It is now in ratified 1.11 privileged spec.
Ok, so I suppose that would apply to about half the 32-bit implementations
and most of the future ones, but not the ones implementing the 1.10 spec
or earlier, right?
> It seems to me that VMSPLIT_4G_4G is quite different from other VMSPLITs,
> because it requires much more changes.
>
> Thanks for showing the stance of kernel community against HIGHMEM support.
> The cited discussion thread is comprehensive and clear. Despite that RV32
> users cannot get upstream support for their large memory, mechnisms like
> VMSPLIT_4G_4G seems to be a promising way to go. That being said, to
> support the theoretical 16G physical memory, eventually kmap* will still
> be needed.
I had not realized that Sv32 supports more than 4GB physical address
space at all. I agree that if someone puts that much RAM into a machine,
there are few alternatives to highmem (in theory one could use the
extra RAM for zswap/zram, but that's not a good replacement).
OTOH actually using more than 1GB or 2GB of physical memory on a
32-bit core is something that I expect to become completely obscure
in the future, as this is where using 32-bit cores tends to get
uneconomical. The situation that I observe across the currently supported
32-bit architectures in the kernel is that:
- There is an incentive to run 32-bit on machines with 1GB of RAM or less
if you have the choice, because of higher memory consumption and
cache utilization on 64-bit code. On systems with 2GB or more, the
cost of managing that memory using 32-bit code usually outweighs
the benefits and you should run at least a 64-bit kernel.
- The high end 32-bit cores (Arm Cortex-A15/A17, MIPS P5600,
PowerPC 750, Intel Pentium 4, Andes A15/D15, ...) are all obsolete
after the follow-on products use 64-bit cores on a smaller process
node, which end up being more capable, faster *and* cheaper.
- The 32-bit cores that do survive are based on simpler in-order
pipelines that are cheaper and can still beat the 64-bit cores in
terms of cost (mostly chip area, sometimes royalties), but not
performance. This includes Arm Cortex-A7, MIPS 24k and typical
RV32 cores.
- On an SoC with a cheap and simple CPU core, there is no point
in spending a lot of money/area/complexity on a high-end memory
controller. On single-core 32-bit SoCs, you usually end up with a single
16 or 32-bit wide DDR2 memory controller, on an SMP system like
most quad-Cortex-A7, you have a 32-bit wide DDR3 controller, but no
DDR4 or LP-DDR3/4.
- The largest economical memory configuration on a 32-bit DDR3
controller is to have two 256Mx16 chips for a total of 1GB. You can
get 2GB with four chips using dual-channel controllers or 512Mx8
memory, but anything beyond that is much more expensive than
upgrading to a 64-bit SoC with LP-DDR4.
This is unlikely to change over time as 64-bit chips are also getting
cheaper and may replace more of the 32-bit chips we see today.
In particular, I expect to see multi-core chips moving to mostly
64-bit cores over time, while 32-bit chips keep using one or
occasionally two cores, further reducing the need for large and/or
fast memory.
Arnd
+ rmk
Hi Arnd,
On Wed, Apr 08, 2020 at 04:40:17PM +0200, Arnd Bergmann wrote:
> No code yet, so far not much more than the ideas that I listed. We
> are currently looking for someone interested in doing the work
> or maybe sponsoring it if they have a strong interest.
If no one has taken it up yet, i am interested in doing the work, i
will sponsor myself :). i will proceed at a slow pace, without derailing
my other regular work.
To keep expectations realistic: i have not yet taken on a task of this
complexity, so it is more of a learning exercise for me. My familiarity with
Linux at the ARM architecture level is mostly on no-MMU (Cortex-M); i have not
so far worked seriously on MMU Linux at the ARM architectural level,
though i used to go through the ARM ARM v7-AR at times.
i have a few 32-bit ARM Cortex-A (A5, A8 & A9) boards, maximum RAM 1G,
none have LPAE, seems i have to buy one for this purpose.
And if someone else wants to do it, i will not take it up.
Regards
afzal
On Tue, Apr 14, 2020 at 5:17 PM afzal mohammed <[email protected]> wrote:
>
> + rmk
>
> Hi Arnd,
>
> On Wed, Apr 08, 2020 at 04:40:17PM +0200, Arnd Bergmann wrote:
>
> > No code yet, so far not much more than the ideas that I listed. We
> > are currently looking for someone interested in doing the work
> > or maybe sponsoring it if they have a strong interest.
>
> If no one have yet taken it up, i am interested in doing the work, i
> will sponsor myself :). i will proceed at a slow pace without derailing
> my other things normal.
>
> To keep expectations realistic: i have not yet taken task of this
> complexity, it is more of a learning for me. My familiarity with Linux
> at the ARM architecture level is mostly on no-MMU (Cortex-M), have not
> worked so far seriously on MMU Linux at the ARM architectural level,
> though used to go through ARM ARM v7-AR at times.
Thanks for offering to help, it's very much appreciated. Let me know how
it goes and if you have any more detailed questions.
> i have a few 32-bit ARM Cortex-A (A5, A8 & A9) boards, maximum RAM 1G,
> none have LPAE, seems i have to buy one for this purpose.
I would recommend starting in a qemu emulated system on a PC host,
you can just set it to emulate a Cortex-A15 or A7, and you can attach
gdb to the qemu instance to see where it crashes (which it inevitably
will).
You can also start by changing the functions in asm/uaccess.h to
use the linear kernel mapping and memcpy(), like the version in
arch/um/kernel/skas/uaccess.c does. This is slow, but will work
regardless of whether user space is mapped, and you can do a
generic implementation that works on any architecture and put that
into include/asm-generic/uaccess.h.
A second step after that could be to unmap user space when entering
the kernel, without any change in the memory layout, this is still
mostly hardware independent and could easily be done in qemu
or any 32-bit ARM CPU.
Another thing to try early is to move the vmlinux virtual address
from the linear mapping into vmalloc space. This does not require
LPAE either, but it only works on relatively modern platforms that
don't have conflicting fixed mappings there.
If you get that far, I'll happily buy you a Raspberry Pi 4 with 4GB
for further experiments ;-)
That one can run both 64-bit and 32-bit kernels (with LPAE),
so you'd be able to test the limits and not rely on qemu to find
all bugs such as missing TLB flushes or barriers.
Arnd
Hi,
On Tue, Apr 14, 2020 at 09:29:46PM +0200, Arnd Bergmann wrote:
> On Tue, Apr 14, 2020 at 5:17 PM afzal mohammed <[email protected]> wrote:
> > + rmk
[ Forgot to provide context to Russell - this is about implementing
VMSPLIT_4G_4G support for 32-bit ARM as a possible replacement of
highmem ]
> > If no one have yet taken it up, i am interested in doing the work, i
> > will sponsor myself :). i will proceed at a slow pace without derailing
> > my other things normal.
> Thanks for offering to help, it's very much appreciated. Let me know how
> it goes and if you have any more detailed questions.
Okay, i will proceed initially w/ things that can be done using qemu &
available ARM boards. Right now no questions, i will probably be coming
up with questions later.
Regards
afzal
> I would recommend starting in a qemu emulated system on a PC host,
> you can just set it to emulate a Cortex-A15 or A7, and you can attach
> gdb to the qemu instance to see where it crashes (which it inevitably
> will).
>
> You can also start by changing the functions in asm/uaccess.h to
> use the linear kernel mapping and memcpy(), like the version in
> arch/um/kernel/skas/uaccess.c does. This is slow, but will work on
> regardless of whether user space is mapped, and you can do a
> generic implementation that works on any architecture and put that
> into include/asm-generic/uaccess.h.
>
> A second step after that could be to unmap user space when entering
> the kernel, without any change in the memory layout, this is still
> mostly hardware independent and could easily be done in qemu
> or any 32-bit ARM CPU.
>
> Another thing to try early is to move the vmlinux virtual address
> from the linear mapping into vmalloc space. This does not require
> LPAE either, but it only works on relatively modern platforms that
> don't have conflicting fixed mappings there.
>
> If you get that far, I'll happily buy you a Raspberry Pi 4 with 4GB
> for further experiments ;-)
> That one can run both 64-bit and 32-bit kernels (with LPAE),
> so you'd be able to test the limits and not rely on qemu to find
> all bugs such as missing TLB flushes or barriers.
Hi Arnd,
> On Tue, Apr 14, 2020 at 09:29:46PM +0200, Arnd Bergmann wrote:
> > Another thing to try early is to move the vmlinux virtual address
> > from the linear mapping into vmalloc space. This does not require
> > LPAE either, but it only works on relatively modern platforms that
> > don't have conflicting fixed mappings there.
i have started by attempting to move the static kernel mapping from lowmem
to vmalloc space. At boot, execution has so far gone past assembly
& reached C, to be specific, arm_memblock_init [in setup_arch()];
i am currently debugging the hang that happens after that point. To make
things easier in the beginning, ARM_PATCH_PHYS_VIRT is disabled &
a platform specific PHYS_OFFSET is fed; this is planned to be fixed once
it boots.
[ i will probably start a new thread or hopefully RFC on LAKML ]
Regards
afzal
On Sun, May 3, 2020 at 4:50 PM afzal mohammed <[email protected]> wrote:
>
> Hi Arnd,
>
> > On Tue, Apr 14, 2020 at 09:29:46PM +0200, Arnd Bergmann wrote:
>
> > > Another thing to try early is to move the vmlinux virtual address
> > > from the linear mapping into vmalloc space. This does not require
> > > LPAE either, but it only works on relatively modern platforms that
> > > don't have conflicting fixed mappings there.
>
> i have started by attempting to move static kernel mapping from lowmem
> to vmalloc space. At boot the execution so far has went past assembly
> & reached C, to be specific, arm_memblock_init [in setup_arch()],
> currently debugging the hang that happens after that point.
Ah, good start. Which SoC platform are you running this on? Just making
sure that this won't conflict with static mappings later.
One problem I see immediately in arm_memblock_init() is that it uses
__pa() to convert from virtual address in the linear map to physical,
but now you actually pass an address that is in vmalloc rather than
the linear map. There are certainly more problems like this to come.
> To make things easier in the beginning, ARM_PATCH_PHYS_VIRT is disabled &
> platform specific PHYS_OFFSET is fed, this is planned to be fixed once
> it boots.
>
> [ i will probably start a new thread or hopefully RFC on LAKML ]
Ok, makes sense.
Arnd
[ +linux-arm-kernel
Context: This is regarding VMSPLIT_4G_4G support for 32-bit ARM as a
possible replacement to highmem. For that, initially, it is being
attempted to move static kernel mapping from lowmem to vmalloc space.
in next reply, i will remove everyone/list !ARM related ]
Hi,
On Sun, May 03, 2020 at 10:20:39PM +0200, Arnd Bergmann wrote:
> Which SoC platform are you running this on? Just making
> sure that this won't conflict with static mappings later.
Versatile Express V2P-CA15 on qemu, qemu options include --smp 2 &
2GB memory.
BTW, i could not convince myself why, except for DEBUG_LL, static io
mappings are used.
>
> One problem I see immediately in arm_memblock_init()
Earlier it went past arm_memblock_init(); the issue was clearing the page
tables from VMALLOC_START in devicemaps_init(), called via paging_init(),
which was like cutting the branch of the tree one is sitting on.
Now it is crashing at debug_ll_io_init() in devicemaps_init(), and
printascii/earlycon was & is being used to debug :). Things are going
wrong when it tries to create the mapping for debug_ll. It looks like a
conflict with a static mapping, which you mentioned above; at the same
time i am not seeing a kernel static mapping at the same virtual
address, so i need to dig deeper.
Also tried removing DEBUG_LL, there is a deafening silence in the
console ;)
> is that it uses
> __pa() to convert from virtual address in the linear map to physical,
> but now you actually pass an address that is in vmalloc rather than
> the linear map.
__virt_to_phys_nodebug(), which does the actual work on __pa() invocation,
has been modified to handle that case (ideas lifted from ARM64's
implementation), though currently it is a hack as below (and applicable
only to the ARM_PATCH_PHYS_VIRT disabled case), other hacks being
VMALLOC_OFFSET set to 0 and adjusting the vmalloc size.
static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
{
phys_addr_t __x = (phys_addr_t)x;
if (__x >= 0xf0000000)
return __x - KIMAGE_OFFSET + PHYS_OFFSET;
else
return __x - PAGE_OFFSET + PHYS_OFFSET;
}
Regards
afzal
On Mon, May 4, 2020 at 11:10 AM afzal mohammed <[email protected]> wrote:
>
> [ +linux-arm-kernel
>
> Context: This is regarding VMSPLIT_4G_4G support for 32-bit ARM as a
> possible replacement to highmem. For that, initially, it is being
> attempted to move static kernel mapping from lowmem to vmalloc space.
>
> in next reply, i will remove everyone/list !ARM related ]
>
> Hi,
>
> On Sun, May 03, 2020 at 10:20:39PM +0200, Arnd Bergmann wrote:
>
> > Which SoC platform are you running this on? Just making
> > sure that this won't conflict with static mappings later.
>
> Versatile Express V2P-CA15 on qemu, qemu options include --smp 2 &
> 2GB memory.
Ok
> BTW, i could not convince myself why, except for DEBUG_LL, static io
> mappings are used.
I don't think vexpress uses it, but others have some 'struct map_desc'
instances mostly for historic reasons, e.g. to map some registers that
are needed at very early boot time, or from assembler files.
> > One problem I see immediately in arm_memblock_init()
>
> Earlier it went past arm_memblock_init(), issue was clearing the page
> tables from VMALLOC_START in devicemaps_init() thr' paging_init(),
> which was like cutting the sitting branch of the tree.
>
> Now it is crashing at debug_ll_io_init() of devicemap_init(), and
> printascii/earlycon was & is being used to debug :). Things are going
> wrong when it tries to create mapping for debug_ll. It looks like a
> conflict with static mapping, which you mentioned above, at the same
> time i am not seeing kernel static mapping in the same virtual
> address, need to dig deeper.
>
> Also tried removing DEBUG_LL, there is a deafening silence in the
> console ;)
I don't think there is any other mapping that would conflict with the
DEBUG_LL one, but you may be in a hole where the existing one
is not mapped. IIRC it first gets mapped in __create_page_tables()
in arch/arm/kernel/head.S, and later in debug_ll_io_init(), but if the
old page tables were just discarded, it won't work for a bit.
Using gdb to step through the code in qemu is often more reliable
than printing to the console, at least until you get to the point
when you have registered the real console.
> __virt_to_phys_nodebug() which does the actual work on __pa() invocation
> has been modifed to handle that case (ideas lifted from ARM64's
> implementation), though currently it is a hack as below (and applicable
> only for ARM_PATCH_PHYS_VIRT disabled case), other hacks being
> VMALLOC_OFFSET set to 0 and adjusting vmalloc size.
>
> static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
> {
> phys_addr_t __x = (phys_addr_t)x;
>
> if (__x >= 0xf0000000)
> return __x - KIMAGE_OFFSET + PHYS_OFFSET;
> else
> return __x - PAGE_OFFSET + PHYS_OFFSET;
> }
Ok, makes sense.
Arnd
Hi,
Kernel now boots to prompt w/ static kernel mapping moved to vmalloc
space.
Changes currently done have a couple of platform specific things, this
has to be modified to make it multiplatform friendly (also to be taken
care is ARM_PATCH_PHYS_VIRT case). Module address space has to be
taken care as well.
Logs follows
Regards
afzal
[ 0.000000] Booting Linux on physical CPU 0x0
[ 0.000000] Linux version 5.7.0-rc1-00043-ge8ffd99475b9c (afzal@afzalpc) (gcc version 8.2.0 (GCC_MA), GNU ld (GCC_MA) 2.31.1) #277 SMP Mon May 11 18:16:51 IST 2020
[ 0.000000] CPU: ARMv7 Processor [412fc0f1] revision 1 (ARMv7), cr=10c5387d
[ 0.000000] CPU: div instructions available: patching division code
[ 0.000000] CPU: PIPT / VIPT nonaliasing data cache, PIPT instruction cache
[ 0.000000] OF: fdt: Machine model: V2P-CA15
[ 0.000000] printk: bootconsole [earlycon0] enabled
[ 0.000000] Memory policy: Data cache writealloc
[ 0.000000] efi: UEFI not found.
[ 0.000000] Reserved memory: created DMA memory pool at 0x18000000, size 8 MiB
[ 0.000000] OF: reserved mem: initialized node vram@18000000, compatible id shared-dma-pool
[ 0.000000] percpu: Embedded 20 pages/cpu s49164 r8192 d24564 u81920
[ 0.000000] Built 1 zonelists, mobility grouping on. Total pages: 522751
[ 0.000000] Kernel command line: console=ttyAMA0,115200 rootwait root=/dev/mmcblk0 earlyprintk
[ 0.000000] Dentry cache hash table entries: 131072 (order: 7, 524288 bytes, linear)
[ 0.000000] Inode-cache hash table entries: 65536 (order: 6, 262144 bytes, linear)
[ 0.000000] mem auto-init: stack:off, heap alloc:off, heap free:off
[ 0.000000] Memory: 2057032K/2097148K available (12288K kernel code, 1785K rwdata, 5188K rodata, 2048K init, 403K bss, 40116K reserved, 0K cma-reserved, 1310716K highmem)
[ 0.000000] Virtual kernel memory layout:
[ 0.000000] vector : 0xffff0000 - 0xffff1000 ( 4 kB)
[ 0.000000] fixmap : 0xffc00000 - 0xfff00000 (3072 kB)
[ 0.000000] vmalloc : 0xf1000000 - 0xff800000 ( 232 MB)
[ 0.000000] lowmem : 0xc0000000 - 0xf0000000 ( 768 MB)
[ 0.000000] pkmap : 0xbfe00000 - 0xc0000000 ( 2 MB)
[ 0.000000] modules : 0xbf000000 - 0xbfe00000 ( 14 MB)
[ 0.000000] .text : 0xf1208000 - 0xf1f00000 (13280 kB)
[ 0.000000] .init : 0xf2500000 - 0xf2700000 (2048 kB)
[ 0.000000] .data : 0xf2700000 - 0xf28be558 (1786 kB)
[ 0.000000] .bss : 0xf28be558 - 0xf29231a8 ( 404 kB)
[ 0.000000] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=1
[ 0.000000] rcu: Hierarchical RCU implementation.
[ 0.000000] rcu: RCU event tracing is enabled.
[ 0.000000] rcu: RCU restricting CPUs from NR_CPUS=16 to nr_cpu_ids=2.
[ 0.000000] rcu: RCU calculated value of scheduler-enlistment delay is 10 jiffies.
[ 0.000000] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=2
[ 0.000000] NR_IRQS: 16, nr_irqs: 16, preallocated irqs: 16
[ 0.000000] random: get_random_bytes called from start_kernel+0x304/0x49c with crng_init=0
[ 0.000311] sched_clock: 32 bits at 24MHz, resolution 41ns, wraps every 89478484971ns
[ 0.006788] clocksource: arm,sp804: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 1911260446275 ns
[ 0.008479] Failed to initialize '/bus@8000000/motherboard/iofpga@3,00000000/timer@120000': -22
[ 0.013414] arch_timer: cp15 timer(s) running at 62.50MHz (virt).
[ 0.013875] clocksource: arch_sys_counter: mask: 0xffffffffffffff max_cycles: 0x1cd42e208c, max_idle_ns: 881590405314 ns
[ 0.014610] sched_clock: 56 bits at 62MHz, resolution 16ns, wraps every 4398046511096ns
[ 0.015199] Switching to timer-based delay loop, resolution 16ns
[ 0.020168] Console: colour dummy device 80x30
[ 0.022219] Calibrating delay loop (skipped), value calculated using timer frequency.. 125.00 BogoMIPS (lpj=625000)
[ 0.026998] pid_max: default: 32768 minimum: 301
[ 0.028835] Mount-cache hash table entries: 2048 (order: 1, 8192 bytes, linear)
[ 0.029319] Mountpoint-cache hash table entries: 2048 (order: 1, 8192 bytes, linear)
[ 0.044484] CPU: Testing write buffer coherency: ok
[ 0.045452] CPU0: Spectre v2: firmware did not set auxiliary control register IBE bit, system vulnerable
[ 0.057536] /cpus/cpu@0 missing clock-frequency property
[ 0.058065] /cpus/cpu@1 missing clock-frequency property
[ 0.058538] CPU0: thread -1, cpu 0, socket 0, mpidr 80000000
[ 0.066972] Setting up static identity map for 0x80300000 - 0x803000ac
[ 0.074772] rcu: Hierarchical SRCU implementation.
[ 0.083336] EFI services will not be available.
[ 0.085605] smp: Bringing up secondary CPUs ...
[ 0.090454] CPU1: thread -1, cpu 1, socket 0, mpidr 80000001
[ 0.090560] CPU1: Spectre v2: firmware did not set auxiliary control register IBE bit, system vulnerable
[ 0.096711] smp: Brought up 1 node, 2 CPUs
[ 0.097132] SMP: Total of 2 processors activated (250.00 BogoMIPS).
[ 0.097557] CPU: All CPU(s) started in SVC mode.
[ 0.120848] devtmpfs: initialized
[ 0.142952] VFP support v0.3: implementor 41 architecture 4 part 30 variant f rev 0
[ 0.163536] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
[ 0.165733] futex hash table entries: 512 (order: 3, 32768 bytes, linear)
[ 0.171110] pinctrl core: initialized pinctrl subsystem
[ 0.212940] thermal_sys: Registered thermal governor 'step_wise'
[ 0.214839] DMI not present or invalid.
[ 0.224963] NET: Registered protocol family 16
[ 0.229817] DMA: preallocated 256 KiB pool for atomic coherent allocations
[ 0.465443] cpuidle: using governor menu
[ 0.467208] No ATAGs?
[ 0.469397] hw-breakpoint: found 5 (+1 reserved) breakpoint and 4 watchpoint registers.
[ 0.470084] hw-breakpoint: maximum watchpoint size is 8 bytes.
[ 0.485193] Serial: AMBA PL011 UART driver
[ 0.518698] 1c090000.uart: ttyAMA0 at MMIO 0x1c090000 (irq = 33, base_baud = 0) is a PL011 rev1
[ 0.520790] printk: console [ttyAMA0] enabled
[ 0.520790] printk: console [ttyAMA0] enabled
[ 0.521639] printk: bootconsole [earlycon0] disabled
[ 0.521639] printk: bootconsole [earlycon0] disabled
[ 0.529010] 1c0a0000.uart: ttyAMA1 at MMIO 0x1c0a0000 (irq = 34, base_baud = 0) is a PL011 rev1
[ 0.532819] 1c0b0000.uart: ttyAMA2 at MMIO 0x1c0b0000 (irq = 35, base_baud = 0) is a PL011 rev1
[ 0.536532] 1c0c0000.uart: ttyAMA3 at MMIO 0x1c0c0000 (irq = 36, base_baud = 0) is a PL011 rev1
[ 0.540655] OF: amba_device_add() failed (-19) for /bus@8000000/motherboard/iofpga@3,00000000/wdt@f0000
[ 0.551052] OF: amba_device_add() failed (-19) for /memory-controller@2b0a0000
[ 0.552311] OF: amba_device_add() failed (-19) for /memory-controller@7ffd0000
[ 0.553945] OF: amba_device_add() failed (-19) for /dma@7ffb0000
[ 0.662992] AT91: Could not find identification node
[ 0.672222] iommu: Default domain type: Translated
[ 0.674784] vgaarb: loaded
[ 0.679204] SCSI subsystem initialized
[ 0.683940] usbcore: registered new interface driver usbfs
[ 0.684662] usbcore: registered new interface driver hub
[ 0.685326] usbcore: registered new device driver usb
[ 0.689559] pps_core: LinuxPPS API ver. 1 registered
[ 0.689914] pps_core: Software ver. 5.3.6 - Copyright 2005-2007 Rodolfo Giometti <[email protected]>
[ 0.690916] PTP clock support registered
[ 0.691941] EDAC MC: Ver: 3.0.0
[ 0.718934] clocksource: Switched to clocksource arch_sys_counter
[ 1.933420] NET: Registered protocol family 2
[ 1.940976] tcp_listen_portaddr_hash hash table entries: 512 (order: 0, 6144 bytes, linear)
[ 1.941657] TCP established hash table entries: 8192 (order: 3, 32768 bytes, linear)
[ 1.942394] TCP bind hash table entries: 8192 (order: 4, 65536 bytes, linear)
[ 1.943048] TCP: Hash tables configured (established 8192 bind 8192)
[ 1.945065] UDP hash table entries: 512 (order: 2, 16384 bytes, linear)
[ 1.945726] UDP-Lite hash table entries: 512 (order: 2, 16384 bytes, linear)
[ 1.947919] NET: Registered protocol family 1
[ 1.953266] RPC: Registered named UNIX socket transport module.
[ 1.953707] RPC: Registered udp transport module.
[ 1.954027] RPC: Registered tcp transport module.
[ 1.954329] RPC: Registered tcp NFSv4.1 backchannel transport module.
[ 1.954838] PCI: CLS 0 bytes, default 64
[ 1.963271] hw perfevents: no interrupt-affinity property for /pmu, guessing.
[ 1.967592] hw perfevents: enabled with armv7_cortex_a15 PMU driver, 1 counters available
[ 1.973260] Initialise system trusted keyrings
[ 1.976327] workingset: timestamp_bits=30 max_order=19 bucket_order=0
[ 2.004549] squashfs: version 4.0 (2009/01/31) Phillip Lougher
[ 2.009700] NFS: Registering the id_resolver key type
[ 2.010286] Key type id_resolver registered
[ 2.010606] Key type id_legacy registered
[ 2.011037] nfs4filelayout_init: NFSv4 File Layout Driver Registering...
[ 2.011896] ntfs: driver 2.1.32 [Flags: R/O].
[ 2.014565] Key type asymmetric registered
[ 2.014933] Asymmetric key parser 'x509' registered
[ 2.015464] bounce: pool size: 64 pages
[ 2.016042] Block layer SCSI generic (bsg) driver version 0.4 loaded (major 248)
[ 2.016684] io scheduler mq-deadline registered
[ 2.017084] io scheduler kyber registered
[ 2.750925] Serial: 8250/16550 driver, 5 ports, IRQ sharing enabled
[ 2.768719] SuperH (H)SCI(F) driver initialized
[ 2.772315] msm_serial: driver initialized
[ 2.773272] STMicroelectronics ASC driver initialized
[ 2.777155] STM32 USART driver initialized
[ 2.844599] brd: module loaded
[ 2.892007] loop: module loaded
[ 2.910637] physmap-flash 8000000.flash: physmap platform flash device: [mem 0x08000000-0x0bffffff]
[ 2.912893] 8000000.flash: Found 2 x16 devices at 0x0 in 32-bit bank. Manufacturer ID 0x000000 Chip ID 0x000000
[ 2.913992] Intel/Sharp Extended Query Table at 0x0031
[ 2.914976] Using buffer write method
[ 2.918471] physmap-flash 8000000.flash: physmap platform flash device: [mem 0x0c000000-0x0fffffff]
[ 2.919575] 8000000.flash: Found 2 x16 devices at 0x0 in 32-bit bank. Manufacturer ID 0x000000 Chip ID 0x000000
[ 2.920202] Intel/Sharp Extended Query Table at 0x0031
[ 2.920911] Using buffer write method
[ 2.921294] Concatenating MTD devices:
[ 2.921588] (0): "8000000.flash"
[ 2.921811] (1): "8000000.flash"
[ 2.922032] into device "8000000.flash"
[ 2.937990] physmap-flash 14000000.psram: physmap platform flash device: [mem 0x14000000-0x15ffffff]
[ 2.940888] physmap-flash 14000000.psram: map_probe failed
[ 2.967104] libphy: Fixed MDIO Bus: probed
[ 2.972770] CAN device driver interface
[ 2.975546] bgmac_bcma: Broadcom 47xx GBit MAC driver loaded
[ 2.979445] e1000e: Intel(R) PRO/1000 Network Driver - 3.2.6-k
[ 2.979877] e1000e: Copyright(c) 1999 - 2015 Intel Corporation.
[ 2.980464] igb: Intel(R) Gigabit Ethernet Network Driver - version 5.6.0-k
[ 2.980903] igb: Copyright (c) 2007-2014 Intel Corporation.
[ 3.060434] libphy: smsc911x-mdio: probed
[ 3.063490] smsc911x 1a000000.ethernet eth0: MAC Address: 52:54:00:12:34:56
[ 3.073487] pegasus: v0.9.3 (2013/04/25), Pegasus/Pegasus II USB Ethernet driver
[ 3.074171] usbcore: registered new interface driver pegasus
[ 3.074716] usbcore: registered new interface driver asix
[ 3.075172] usbcore: registered new interface driver ax88179_178a
[ 3.075782] usbcore: registered new interface driver cdc_ether
[ 3.076321] usbcore: registered new interface driver smsc75xx
[ 3.076838] usbcore: registered new interface driver smsc95xx
[ 3.077343] usbcore: registered new interface driver net1080
[ 3.077813] usbcore: registered new interface driver cdc_subset
[ 3.078734] usbcore: registered new interface driver zaurus
[ 3.079352] usbcore: registered new interface driver cdc_ncm
[ 3.199902] isp1760 1b000000.usb: bus width: 32, oc: digital
[ 3.201761] isp1760 1b000000.usb: NXP ISP1760 USB Host Controller
[ 3.202801] isp1760 1b000000.usb: new USB bus registered, assigned bus number 1
[ 3.205096] isp1760 1b000000.usb: Scratch test failed.
[ 3.205605] isp1760 1b000000.usb: can't setup: -19
[ 3.206685] isp1760 1b000000.usb: USB bus 1 deregistered
[ 3.209178] ehci_hcd: USB 2.0 'Enhanced' Host Controller (EHCI) Driver
[ 3.209645] ehci-pci: EHCI PCI platform driver
[ 3.210241] ehci-platform: EHCI generic platform driver
[ 3.211257] ehci-orion: EHCI orion driver
[ 3.212078] SPEAr-ehci: EHCI SPEAr driver
[ 3.212843] ehci-st: EHCI STMicroelectronics driver
[ 3.213655] ehci-exynos: EHCI Exynos driver
[ 3.214447] ehci-atmel: EHCI Atmel driver
[ 3.215263] tegra-ehci: Tegra EHCI driver
[ 3.216089] ohci_hcd: USB 1.1 'Open' Host Controller (OHCI) Driver
[ 3.216616] ohci-pci: OHCI PCI platform driver
[ 3.217214] ohci-platform: OHCI generic platform driver
[ 3.218371] SPEAr-ohci: OHCI SPEAr driver
[ 3.219194] ohci-st: OHCI STMicroelectronics driver
[ 3.220017] ohci-atmel: OHCI Atmel driver
[ 3.222828] usbcore: registered new interface driver usb-storage
[ 3.247467] rtc-pl031 1c170000.rtc: registered as rtc0
[ 3.249524] rtc-pl031 1c170000.rtc: setting system clock to 2020-05-11T12:47:12 UTC (1589201232)
[ 3.255627] i2c /dev entries driver
[ 3.306738] mmci-pl18x 1c050000.mmci: Got CD GPIO
[ 3.307311] mmci-pl18x 1c050000.mmci: Got WP GPIO
[ 3.311750] mmci-pl18x 1c050000.mmci: mmc0: PL181 manf 41 rev0 at 0x1c050000 irq 29,30 (pio)
[ 3.343018] sdhci: Secure Digital Host Controller Interface driver
[ 3.343731] sdhci: Copyright(c) Pierre Ossman
[ 3.354530] Synopsys Designware Multimedia Card Interface Driver
[ 3.360247] sdhci-pltfm: SDHCI platform and OF driver helper
[ 3.380790] input: AT Raw Set 2 keyboard as /devices/platform/bus@8000000/bus@8000000:motherboard/bus@8000000:motherboard:iofpga@3,00000000/1c060000.kmi/serio0/input/input0
[ 3.387235] ledtrig-cpu: registered to indicate activity on CPUs
[ 3.392136] usbcore: registered new interface driver usbhid
[ 3.392544] usbhid: USB HID core driver
[ 3.406774] drop_monitor: Initializing network drop monitor service
[ 3.412034] NET: Registered protocol family 10
[ 3.413410] mmc0: new SD card at address 4567
[ 3.417368] mmcblk0: mmc0:4567 QEMU! 60.0 MiB
[ 3.422359] Segment Routing with IPv6
[ 3.423208] sit: IPv6, IPv4 and MPLS over IPv4 tunneling driver
[ 3.427555] NET: Registered protocol family 17
[ 3.428389] can: controller area network core (rev 20170425 abi 9)
[ 3.429451] NET: Registered protocol family 29
[ 3.429872] can: raw protocol (rev 20170425)
[ 3.430260] can: broadcast manager protocol (rev 20170425 t)
[ 3.430729] can: netlink gateway (rev 20190810) max_hops=1
[ 3.432999] Key type dns_resolver registered
[ 3.433966] ThumbEE CPU extension supported.
[ 3.434334] Registering SWP/SWPB emulation handler
[ 3.436377] Loading compiled-in X.509 certificates
[ 3.467861] uart-pl011 1c090000.uart: no DMA platform data
[ 4.027279] input: ImExPS/2 Generic Explorer Mouse as /devices/platform/bus@8000000/bus@8000000:motherboard/bus@8000000:motherboard:iofpga@3,00000000/1c070000.kmi/serio1/input/input2
[ 4.053481] EXT4-fs (mmcblk0): mounting ext2 file system using the ext4 subsystem
[ 4.056802] random: fast init done
[ 4.095798] EXT4-fs (mmcblk0): mounted filesystem without journal. Opts: (null)
[ 4.098373] VFS: Mounted root (ext2 filesystem) readonly on device 179:0.
[ 4.103806] devtmpfs: mounted
[ 6.986226] Freeing unused kernel memory: 2048K
[ 7.082498] Run /sbin/init as init process
[ 7.141076] process '/bin/busybox' started with executable stack
[ 7.887228] random: crng init done
[ 8.519622] EXT4-fs (mmcblk0): warning: mounting unchecked fs, running e2fsck is recommended
[ 8.523397] EXT4-fs (mmcblk0): re-mounted. Opts: (null)
[ 8.524252] ext2 filesystem being remounted at / supports timestamps until 2038 (0x7fffffff)
Starting syslogd: OK
Starting klogd: OK
Running sysctl: OK
Initializing random number generator... done.
Starting network: [ 10.177578] SMSC LAN911x Internal PHY 1a000000.ethernet-ffffffff:01: attached PHY driver [SMSC LAN911x Internal PHY] (mii_bus:phy_addr=1a000000.ethernet-ffffffff:01, irq=POLL)
[ 10.208699] smsc911x 1a000000.ethernet eth0: SMSC911x/921x identified at 0xf2b30000, IRQ: 26
[ 10.216383] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
udhcpc: started, v1.31.0
udhcpc: sending discover
udhcpc: sending select for 10.0.2.15
udhcpc: lease of 10.0.2.15 obtained, lease time 86400
deleting routers
adding dns 10.0.2.3
OK
Welcome to Buildroot
buildroot login: root
# uname -a
Linux buildroot 5.7.0-rc1-00043-ge8ffd99475b9c #277 SMP Mon May 11 18:16:51 IST 2020 armv7l GNU/Linux
On Mon, May 11, 2020 at 4:21 PM afzal mohammed <[email protected]> wrote:
>
> Hi,
>
> Kernel now boots to prompt w/ static kernel mapping moved to vmalloc
> space.
>
> Changes currently done have a couple of platform specific things, this
> has to be modified to make it multiplatform friendly (also to be taken
> care is ARM_PATCH_PHYS_VIRT case).
Nice!
> Module address space has to be taken care as well.
What do you currently do with the module address space? I suppose the
easiest way is to just always put modules into vmalloc space, as we already
do with CONFIG_ARM_MODULE_PLTS when the special area gets full,
but that could be optimized once the rest works.
Arnd
Hi,
On Mon, May 11, 2020 at 05:29:29PM +0200, Arnd Bergmann wrote:
> What do you currently do with the module address space?
In the current setup, module address space was untouched, i.e. virtual
address difference b/n text & module space is far greater than 32MB, at
least > (2+768+16)MB and modules can't be loaded unless ARM_MODULE_PLTS
is enabled (this was checked now)
> easiest way is to just always put modules into vmalloc space, as we already
> do with CONFIG_ARM_MODULE_PLTS when the special area gets full,
> but that could be optimized once the rest works.
Okay
Regards
afzal
On Tue, May 12, 2020 at 12:48 PM afzal mohammed <[email protected]> wrote:
>
> On Mon, May 11, 2020 at 05:29:29PM +0200, Arnd Bergmann wrote:
>
> > What do you currently do with the module address space?
>
> In the current setup, module address space was untouched, i.e. virtual
> address difference b/n text & module space is far greater than 32MB, at
> least > (2+768+16)MB and modules can't be loaded unless ARM_MODULE_PLTS
> is enabled (this was checked now)
>
> > easiest way is to just always put modules into vmalloc space, as we already
> > do with CONFIG_ARM_MODULE_PLTS when the special area gets full,
> > but that could be optimized once the rest works.
>
> Okay
Any idea which bit you want to try next? Creating a raw_copy_{from,to}_user()
based on get_user_pages()/kmap_atomic()/memcpy() is probably a good
next thing to do. I think it can be done one page at a time with only
checking for
get_fs(), access_ok(), and page permissions, while get_user()/put_user()
need to handle a few more corner cases.
Arnd
Hi,
On Tue, May 12, 2020 at 09:49:59PM +0200, Arnd Bergmann wrote:
> Any idea which bit you want to try next?
My plan has been to next post patches for the static kernel migration
to vmalloc space (currently the code is rigid, taking easy route
wherever possible & not of high quality) as that feature has an
independent existence & adds value by itself. And then start working
on other steps towards VMSPLIT_4G_4G.
Now that you mentioned about other things, i will slowly start those
as well.
> Creating a raw_copy_{from,to}_user()
> based on get_user_pages()/kmap_atomic()/memcpy() is probably a good
> next thing to do. I think it can be done one page at a time with only
> checking for
> get_fs(), access_ok(), and page permissions, while get_user()/put_user()
> need to handle a few more corner cases.
Before starting w/ other things, i would like to align on the high
level design,
My understanding (mostly based on your comments) as follows,
(i currently do not have a firm grip over these things, hope to have
it once started w/ the implementation)
1. SoC w/ LPAE
2. TTBR1 (top 256MB) for static kernel, modules, io mappings, vmalloc,
kmap, fixmap & vectors
3. TTBR0 (low 3768MB) for user space & lowmem (kernel lowmem to have
separate ASID)
4. for user space to/from copy
a. pin user pages
b. kmap user page (can't corresponding lowmem be used instead ?)
c. copy
Main points are as above, right ?, anything missed ?, or anything more
you want to add ?, let me know your opinion.
Regards
afzal
On Thu, May 14, 2020 at 1:18 PM afzal mohammed <[email protected]> wrote:
> On Tue, May 12, 2020 at 09:49:59PM +0200, Arnd Bergmann wrote:
>
> > Any idea which bit you want to try next?
>
> My plan has been to next post patches for the static kernel migration
> to vmalloc space (currently the code is rigid, taking easy route
> wherever possible & not of high quality) as that feature has an
> independent existence & adds value by itself. And then start working
> on other steps towards VMSPLIT_4G_4G.
>
> Now that you mentioned about other things, i will slowly start those
> as well.
Sounds good.
> > Creating a raw_copy_{from,to}_user()
> > based on get_user_pages()/kmap_atomic()/memcpy() is probably a good
> > next thing to do. I think it can be done one page at a time with only
> > checking for
> > get_fs(), access_ok(), and page permissions, while get_user()/put_user()
> > need to handle a few more corner cases.
>
> Before starting w/ other things, i would like to align on the high
> level design,
>
> My understanding (mostly based on your comments) as follows,
> (i currently do not have a firm grip over these things, hope to have
> it once started w/ the implementation)
>
> 1. SoC w/ LPAE
> 2. TTBR1 (top 256MB) for static kernel, modules, io mappings, vmalloc,
> kmap, fixmap & vectors
Right, these kind of go together because pre-LPAE cannot do the
same TTBR1 split, and they more frequently have conflicting
static mappings.
It's clearly possible to do something very similar for older chips
(v6 or v7 without LPAE, possibly even v5), it just gets harder
while providing less benefit.
> 3. TTBR0 (low 3768MB) for user space & lowmem (kernel lowmem to have
> separate ASID)
Right. This could in theory become a boot-time decision, using
a larger TTBR1 on machines that want more vmalloc space, but a
hardcoded 3840/256 split is likely the best compromise of all the
constraints.
> 4. for user space to/from copy
> a. pin user pages
> b. kmap user page (can't corresponding lowmem be used instead ?)
> c. copy
Right, this is probably the simplest and most generic implementation,
it can even be in an architecture-independent lib/uaccess-generic
file.
These are the trade-offs I see:
- we can have optimizations for get_user/put_user or small copy_from_user
based on ttbr0 switching, but large copies should probably use the pinned
page approach anyway, and it's easier to only have one method to
start with, and then measure the overhead before starting to optimize.
- In the long run, there is no need for kmap()/kmap_atomic() after
highmem gets removed from the kernel, but for the next few years
we should still assume that highmem can be used, in order to support
systems like the 8GB highbank, armadaxp, keystone2 or virtual
machines. For lowmem pages (i.e. all pages when highmem is
disabled), kmap_atomic() falls back to page_address() anyway,
so there is not much overhead.
Arnd
Hi,
On Thu, May 14, 2020 at 02:41:11PM +0200, Arnd Bergmann wrote:
> On Thu, May 14, 2020 at 1:18 PM afzal mohammed <[email protected]> wrote:
> > 1. SoC w/ LPAE
> > 2. TTBR1 (top 256MB) for static kernel, modules, io mappings, vmalloc,
> > kmap, fixmap & vectors
> Right, these kind of go together because pre-LPAE cannot do the
> same TTBR1 split, and they more frequently have conflicting
> static mappings.
>
> It's clearly possible to do something very similar for older chips
> (v6 or v7 without LPAE, possibly even v5), it just gets harder
> while providing less benefit.
Yes, lets have it only for LPAE
> > 3. TTBR0 (low 3768MB) for user space & lowmem (kernel lowmem to have
> hardcoded 3840/256 split is likely the best compromise of all the
hmm,i swallowed 72MB ;)
> > 4. for user space to/from copy
> > a. pin user pages
> > b. kmap user page (can't corresponding lowmem be used instead ?)
> - In the long run, there is no need for kmap()/kmap_atomic() after
> highmem gets removed from the kernel, but for the next few years
> we should still assume that highmem can be used, in order to support
> systems like the 8GB highbank, armadaxp, keystone2 or virtual
> machines. For lowmem pages (i.e. all pages when highmem is
> disabled), kmap_atomic() falls back to page_address() anyway,
> so there is no much overhead.
Here i have some confusion - iiuc, VMSPLIT_4G_4G is meant to help
platforms having RAM > 768M and <= 4GB disable high memory and still
be able to access full RAM, so high memory shouldn't come into picture,
right ?. And for the above platforms it can continue current VMSPLIT
option (the default 3G/1G), no ?, as VMSPLIT_4G_4G can't help complete
8G to be accessible from lowmem.
So if we make VMSPLIT_4G_4G, depends on !HIGH_MEMORY (w/ mention of
caveat in Kconfig help that this is meant for platforms w/ <=4GB), then
we can do copy_{from,to}_user the same way we currently do, and no need to
do the user page pinning & kmap, right ?
Only problem i see is Kernel compiled w/ VMSPLIT_4G_4G not suitable
for >4GB machines, but anyway iiuc, it was not meant for those
machines. And it is not going to affect our current multiplatform
setup as LPAE is not defined in multi_v7.
Regards
afzal
Hi,
On Thu, May 14, 2020 at 07:05:45PM +0530, afzal mohammed wrote:
> So if we make VMSPLIT_4G_4G, depends on !HIGH_MEMORY (w/ mention of
> caveat in Kconfig help that this is meant for platforms w/ <=4GB), then
> we can do copy_{from,to}_user the same way currently do, and no need to
> do the user page pinning & kmap, right ?
i think user page pinning is still required, but kmap can be avoided
by using lowmem corresponding to that page, right ?, or am i
completely wrong ?
Regards
afzal
On Thu, May 14, 2020 at 3:35 PM afzal mohammed <[email protected]> wrote:
> On Thu, May 14, 2020 at 02:41:11PM +0200, Arnd Bergmann wrote:
> > On Thu, May 14, 2020 at 1:18 PM afzal mohammed <[email protected]> wrote:
> > > 4. for user space to/from copy
> > > a. pin user pages
> > > b. kmap user page (can't corresponding lowmem be used instead ?)
>
> > - In the long run, there is no need for kmap()/kmap_atomic() after
> > highmem gets removed from the kernel, but for the next few years
> > we should still assume that highmem can be used, in order to support
> > systems like the 8GB highbank, armadaxp, keystone2 or virtual
> > machines. For lowmem pages (i.e. all pages when highmem is
> > disabled), kmap_atomic() falls back to page_address() anyway,
> > so there is no much overhead.
>
> Here i have some confusion - iiuc, VMSPLIT_4G_4G is meant to help
> platforms having RAM > 768M and <= 4GB disable high memory and still
> be able to access full RAM, so high memory shouldn't come into picture,
> right ?. And for the above platforms it can continue current VMPSLIT
> option (the default 3G/1G), no ?, as VMSPLIT_4G_4G can't help complete
> 8G to be accessible from lowmem.
>
> So if we make VMSPLIT_4G_4G, depends on !HIGH_MEMORY (w/ mention of
> caveat in Kconfig help that this is meant for platforms w/ <=4GB), then
> we can do copy_{from,to}_user the same way currently do, and no need to
> do the user page pinning & kmap, right ?
No, that doesn't work: the current copy_from_user() relies on the user
address space being a subset of the kernel address space, so it doesn't
have to walk the page tables but just access the pointer and use
the .text.fixup/__ex_table trick to trap any accesses to pages with the
wrong permission or no backing.
> ...
> i think user page pinning is still required, but kmap can be avoided
> by using lowmem corresponding to that page, right ?, or am i
> completely wrong ?
As long as one does not actually use highmem on a particular
machine, kmap_atomic() is much cheaper than the get_user_pages()
that is needed anyway:
static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
preempt_disable();
pagefault_disable();
if (!PageHighMem(page))
return page_address(page);
return kmap_atomic_high_prot(page, prot);
}
#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot)
The preempt_disable()/pagefault_disable() are just adding to
variables that are usually on the local L1 cache, and the
page_address() is another multiply+add that you need in any
case. In kernels that disable highmem, the expensive code path
(kmap_atomic_high_prot) goes away entirely.
As a micro-optimization, one could use page_address() directly
instead of kmap_atomic() when highmem is disabled, but
I doubt it makes much of a difference.
> Only problem i see is Kernel compiled w/ VMSPLIT_4G_4G not suitable
> for >4GB machines, but anyway iiuc, it was not meant for those
> machines. And it is not going to affect our current multiplatform
> setup as LPAE is not defined in multi_v7.
That was what I originally thought as well, but I'd now prefer to
allow highmem to coexist with vmsplit-4g-4g:
Typical distros currently offer two kernels, with and without LPAE,
and they probably don't want to add a third one for LPAE with
either highmem or vmsplit-4g-4g. Having extra user address
space and more lowmem is both going to help users that
still have 8GB configurations.
If we want to limit the number of combinations, I'd prefer making
vmsplit-4g-4g mandatory for all LPAE kernels once it is stable
enough.
Arnd
On Thu, May 14, 2020 at 02:41:11PM +0200, Arnd Bergmann wrote:
> On Thu, May 14, 2020 at 1:18 PM afzal mohammed <[email protected]> wrote:
> > On Tue, May 12, 2020 at 09:49:59PM +0200, Arnd Bergmann wrote:
> >
> > > Any idea which bit you want to try next?
> >
> > My plan has been to next post patches for the static kernel migration
> > to vmalloc space (currently the code is rigid, taking easy route
> > wherever possible & not of high quality) as that feature has an
> > independent existence & adds value by itself. And then start working
> > on other steps towards VMSPLIT_4G_4G.
> >
> > Now that you mentioned about other things, i will slowly start those
> > as well.
>
> Sounds good.
>
> > > Creating a raw_copy_{from,to}_user()
> > > based on get_user_pages()/kmap_atomic()/memcpy() is probably a good
> > > next thing to do. I think it can be done one page at a time with only
> > > checking for
> > > get_fs(), access_ok(), and page permissions, while get_user()/put_user()
> > > need to handle a few more corner cases.
> >
> > Before starting w/ other things, i would like to align on the high
> > level design,
> >
> > My understanding (mostly based on your comments) as follows,
> > (i currently do not have a firm grip over these things, hope to have
> > it once started w/ the implementation)
> >
> > 1. SoC w/ LPAE
> > 2. TTBR1 (top 256MB) for static kernel, modules, io mappings, vmalloc,
> > kmap, fixmap & vectors
>
> Right, these kind of go together because pre-LPAE cannot do the
> same TTBR1 split, and they more frequently have conflicting
> static mappings.
>
> It's clearly possible to do something very similar for older chips
> (v6 or v7 without LPAE, possibly even v5), it just gets harder
> while providing less benefit.
Forget about doing this for anything without a PIPT cache - or you're
going to end up having to flush the data cache each time you enter or
exit the kernel.
--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 10.2Mbps down 587kbps up
On Thu, May 14, 2020 at 6:25 PM Russell King - ARM Linux admin
<[email protected]> wrote:
> On Thu, May 14, 2020 at 02:41:11PM +0200, Arnd Bergmann wrote:
> > On Thu, May 14, 2020 at 1:18 PM afzal mohammed <[email protected]> wrote:
> > It's clearly possible to do something very similar for older chips
> > (v6 or v7 without LPAE, possibly even v5), it just gets harder
> > while providing less benefit.
>
> Forget about doing this for anything without a PIPT cache - or you're
> going to end up having to flush the data cache each time you enter or
> exit the kernel.
Right, let's forget I said anything about v5 or earlier ;-)
I expected the non-aliasing VIPT caches to work the same as PIPT, can
you clarify if there is something to be aware of for those? I see that some
ARMv8 chips and most ARMv6 chips (not OMAP2 and Realview) are
of that kind, and we clearly don't want to break running on ARMv8 at
least.
Anyway my point was that it's best to only do it for LPAE anyway, everything
else being a distraction, as the only non-LPAE SoCs I could find with
support for over 2GB are some of the higher-end i.MX6 versions and the
original highbank.
Arnd
On Thu, May 14, 2020 at 11:12:01PM +0200, Arnd Bergmann wrote:
> On Thu, May 14, 2020 at 6:25 PM Russell King - ARM Linux admin
> <[email protected]> wrote:
> > On Thu, May 14, 2020 at 02:41:11PM +0200, Arnd Bergmann wrote:
> > > On Thu, May 14, 2020 at 1:18 PM afzal mohammed <[email protected]> wrote:
>
> > > It's clearly possible to do something very similar for older chips
> > > (v6 or v7 without LPAE, possibly even v5), it just gets harder
> > > while providing less benefit.
> >
> > Forget about doing this for anything without a PIPT cache - or you're
> > going to end up having to flush the data cache each time you enter or
> > exit the kernel.
>
> Right, let's forget I said anything about v5 or earlier ;-)
>
> I expected the non-aliasing VIPT caches to work the same as PIPT, can
> you clarify if there is something to be aware of for those? I see that some
> ARMv8 chips and most ARMv6 chips (not OMAP2 and Realview) are
> of that kind, and at we clearly don't want to break running on ARMv8 at
> least.
There are some aliasing VIPT implementations on ARMv6, but I don't
remember how common.
> Anyway my point was that it's best to only do it for LPAE anyway, everything
> else being a distraction, as the only non-LPAE SoCs I could find with
> support for over 2GB are some of the higher-end i.MX6 versions and the
> original highbank.
Yep.
--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 10.2Mbps down 587kbps up
On Fri, May 15, 2020 at 1:40 AM Russell King - ARM Linux admin
<[email protected]> wrote:
> On Thu, May 14, 2020 at 11:12:01PM +0200, Arnd Bergmann wrote:
> > On Thu, May 14, 2020 at 6:25 PM Russell King - ARM Linux admin
> > <[email protected]> wrote:
> > > On Thu, May 14, 2020 at 02:41:11PM +0200, Arnd Bergmann wrote:
> > > > On Thu, May 14, 2020 at 1:18 PM afzal mohammed <[email protected]> wrote:
> > I expected the non-aliasing VIPT caches to work the same as PIPT, can
> > you clarify if there is something to be aware of for those? I see that some
> > ARMv8 chips and most ARMv6 chips (not OMAP2 and Realview) are
> > of that kind, and at we clearly don't want to break running on ARMv8 at
> > least.
>
> There are some aliasing VIPT implementations on ARMv6, but I don't
> remember how common.
I thought it was only realview-pb and omap2, but it seems there
are more, at least ast2500 is an important example.
I could not find information about integrator-cp and picoxcell.
For reference, this is a list of the arm11 chips we currently support,
with the aliasing dcache ones marked '*':
* ast2500: armv6k, arm1176, ??KB aliasing VIPT
bcm2835: armv6k, arm1176, 16KB non-aliasing VIPT
cns3xxx: armv6k, arm11mpcore, 32kb PIPT
imx3: armv6, arm1136r0, 16kb non-aliasing VIPT
integrator CM1136JF-S core module: arm1136r?, 16kb non-aliasing VIPT
? integrator CTB36 core tile: arm1136r?, ???
ox820: armv6k, arm11mpcore, ??KB PIPT
* omap2: armv6, arm1136r0, 32kb aliasing VIPT
? picoxcell: armv6k, arm11??
* realview-pb1176: armv6k, arm1176, 32kb aliasing VIPT
realview-eb with 1176 core tile: armv6k, 16kb non-aliasing VIPT
realview-eb with 11mpcore core tile: armv6k, 32kb PIPT
s3c64xx: armv6k, arm1176, 16kb non-aliasing VIPT
wm8750: armv6k, arm1176: 16kb non-aliasing VIPT
Arnd
Hi,
On Thu, May 14, 2020 at 05:32:41PM +0200, Arnd Bergmann wrote:
> Typical distros currently offer two kernels, with and without LPAE,
> and they probably don't want to add a third one for LPAE with
> either highmem or vmsplit-4g-4g. Having extra user address
> space and more lowmem is both going to help users that
> still have 8GB configurations.
Okay, so the conclusion i take is,
1. VMSPLIT 4G/4G have to live alongside highmem
2. For user space copy, do pinning followed by kmap
Regards
afzal
On Sat, May 16, 2020 at 8:06 AM afzal mohammed <[email protected]> wrote:
>
> On Thu, May 14, 2020 at 05:32:41PM +0200, Arnd Bergmann wrote:
>
> > Typical distros currently offer two kernels, with and without LPAE,
> > and they probably don't want to add a third one for LPAE with
> > either highmem or vmsplit-4g-4g. Having extra user address
> > space and more lowmem is both going to help users that
> > still have 8GB configurations.
>
> Okay, so the conclusion i take is,
>
> 1. VMSPLIT 4G/4G have to live alongside highmem
> 2. For user space copy, do pinning followed by kmap
Right, though kmap_atomic() should be sufficient here
because it is always a short-lived mapping.
Arnd
Hi,
On Sat, May 16, 2020 at 09:35:57AM +0200, Arnd Bergmann wrote:
> On Sat, May 16, 2020 at 8:06 AM afzal mohammed <[email protected]> wrote:
> > Okay, so the conclusion i take is,
> > 1. VMSPLIT 4G/4G have to live alongside highmem
> > 2. For user space copy, do pinning followed by kmap
> Right, though kmap_atomic() should be sufficient here
> because it is always a short-lived mapping.
get_user_pages_fast() followed by kmap_atomic() & then memcpy() seems
to work in principle for user copy.
Verified in a crude way by pointing TTBR0 to a location that has user
pgd's cleared upon entry to copy_to_user() & restoring TTBR0 to
earlier value after user copying was done and ensuring boot.
Meanwhile more testing w/ kernel static mapping in vmalloc space
revealed a major issue, w/ LPAE it was not booting. There were issues
related to pmd handling, w/ !LPAE those issues were not present as pmd
is in effect equivalent to pgd. The issues have been fixed; LPAE now
boots, but it feels kind of fragile, so i will probably have to revisit
it.
Regards
afzal
On Sun, Jun 07, 2020 at 06:29:32PM +0530, afzal mohammed wrote:
> Hi,
>
> On Sat, May 16, 2020 at 09:35:57AM +0200, Arnd Bergmann wrote:
> > On Sat, May 16, 2020 at 8:06 AM afzal mohammed <[email protected]> wrote:
>
> > > Okay, so the conclusion i take is,
> > > 1. VMSPLIT 4G/4G have to live alongside highmem
> > > 2. For user space copy, do pinning followed by kmap
>
> > Right, though kmap_atomic() should be sufficient here
> > because it is always a short-lived mapping.
>
> get_user_pages_fast() followed by kmap_atomic() & then memcpy() seems
> to work in principle for user copy.
Have you done any performance evaluation of the changes yet? I think
it would be a good idea to keep that in the picture. If there's any
significant regression, then that will need addressing.
--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTC for 0.8m (est. 1762m) line in suburbia: sync at 13.1Mbps down 424kbps up
On Sun, Jun 7, 2020 at 2:59 PM afzal mohammed <[email protected]> wrote:
> On Sat, May 16, 2020 at 09:35:57AM +0200, Arnd Bergmann wrote:
> > On Sat, May 16, 2020 at 8:06 AM afzal mohammed <[email protected]> wrote:
>
> > > Okay, so the conclusion i take is,
> > > 1. VMSPLIT 4G/4G have to live alongside highmem
> > > 2. For user space copy, do pinning followed by kmap
>
> > Right, though kmap_atomic() should be sufficient here
> > because it is always a short-lived mapping.
>
> get_user_pages_fast() followed by kmap_atomic() & then memcpy() seems
> to work in principle for user copy.
Nice!
I think you have to use get_user_pages() though instead of
get_user_pages_fast(),
in order to be able to check the permission bits to prevent doing a
copy_to_user()
into read-only mappings.
> Verified in a crude way by pointing TTBR0 to a location that has user
> pgd's cleared upon entry to copy_to_user() & restoring TTBR0 to
> earlier value after user copying was done and ensuring boot.
Do you want me to review the uaccess patch to look for any missing
corner cases, or do you want to do the whole set of user access helpers
first?
> Meanwhile more testing w/ kernel static mapping in vmalloc space
> revealed a major issue, w/ LPAE it was not booting. There were issues
> related to pmd handling, w/ !LPAE those issues were not present as pmd
> is in effect equivalent to pgd. The issues has been fixed, though now
> LPAE boots, but feel a kind of fragile, will probably have to revisit
> it.
Ok.
Arnd
Hi,
[ my previous mail did not make into linux-arm-kernel mailing list,
got a mail saying it has a suspicious header and that it is waiting
moderator approval ]
On Sun, Jun 07, 2020 at 05:11:16PM +0100, Russell King - ARM Linux admin wrote:
> On Sun, Jun 07, 2020 at 06:29:32PM +0530, afzal mohammed wrote:
> > get_user_pages_fast() followed by kmap_atomic() & then memcpy() seems
> > to work in principle for user copy.
>
> Have you done any performance evaluation of the changes yet? I think
> it would be a good idea to keep that in the picture. If there's any
> significant regression, then that will need addressing.
Not yet. Yes, i will do the performance evaluation.
i am also worried about the impact on performance as these
[ get_user_pages() or friends, kmap_atomic() ] are additionally
invoked in the copy_{from,to}_user() path now.
Note that this was done on a topic branch for user copy. Changes for
kernel static mapping to vmalloc has not been merged with these.
Also having kernel lowmem w/ a separate asid & switching at kernel
entry/exit b/n user & kernel lowmem by changing ttbr0 is yet to be
done. Quite a few things remaining to be done to achieve vmsplit 4g/4g.
Regards
afzal
Hi,
On Sun, Jun 07, 2020 at 09:26:26PM +0200, Arnd Bergmann wrote:
> I think you have to use get_user_pages() though instead of
> get_user_pages_fast(),
> in order to be able to check the permission bits to prevent doing a
> copy_to_user()
> into read-only mappings.
i was not aware of this, is it documented somewhere ?, afaiu,
difference b/n get_user_pages_fast() & get_user_pages() is that fast
version will try to pin pages w/o acquiring mmap_sem if possible.
> Do you want me to review the uaccess patch to look for any missing
> corner cases, or do you want to do the whole set of user access helpers
> first?
i will cleanup and probably post RFC initially for the changes
handling copy_{from,to}_user() to get feedback.
Regards
afzal
Hi,
On Mon, Jun 08, 2020 at 04:43:57PM +0200, Arnd Bergmann wrote:
> There is another difference: get_user_pages_fast() does not return
> a vm_area_struct pointer, which is where you would check the access
> permissions. I suppose those pointers could not be returned to callers
> that don't already hold the mmap_sem.
Ok, thanks for the details, i need to familiarize better with mm.
Regards
afzal
On Mon, Jun 8, 2020 at 1:18 PM afzal mohammed <[email protected]> wrote:
> On Sun, Jun 07, 2020 at 09:26:26PM +0200, Arnd Bergmann wrote:
>
> > I think you have to use get_user_pages() though instead of
> > get_user_pages_fast(),
> > in order to be able to check the permission bits to prevent doing a
> > copy_to_user()
> > into read-only mappings.
>
> i was not aware of this, is it documented somewhere ?, afaiu,
> difference b/n get_user_pages_fast() & get_user_pages() is that fast
> version will try to pin pages w/o acquiring mmap_sem if possible.
There is another difference: get_user_pages_fast() does not return
a vm_area_struct pointer, which is where you would check the access
permissions. I suppose those pointers could not be returned to callers
that don't already hold the mmap_sem.
> > Do you want me to review the uaccess patch to look for any missing
> > corner cases, or do you want to do the whole set of user access helpers
> > first?
>
> i will cleanup and probably post RFC initially for the changes
> handling copy_{from,to}_user() to get feedback.
Ok.
Arnd
Hi,
On Mon, Jun 08, 2020 at 08:47:27PM +0530, afzal mohammed wrote:
> On Mon, Jun 08, 2020 at 04:43:57PM +0200, Arnd Bergmann wrote:
> > There is another difference: get_user_pages_fast() does not return
> > a vm_area_struct pointer, which is where you would check the access
> > permissions. I suppose those pointers could not be returned to callers
> > that don't already hold the mmap_sem.
>
> Ok, thanks for the details, i need to familiarize better with mm.
i was & now more confused w.r.t checking access permission using
vm_area_struct to deny write on a read only user page.
i have been using get_user_pages_fast() w/ FOLL_WRITE in copy_to_user.
Isn't that sufficient ?, afaiu, get_user_pages_fast() will ensure that
w/ FOLL_WRITE, pte has write permission, else no struct page * is
handed back to the caller.
One of the simplified paths which could be relevant in the majority of
the cases that i figured out as follows,
get_user_pages_fast
internal_user_pages_fast
gup_pgd_range [ no mmap_sem acquire path]
gup_p4d_range
gup_pud_range
gup_pmd_range
gup_pte_range
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
[ causes to return NULL page if access violation ]
__gup_longterm_unlocked [ mmap_sem acquire path]
get_user_pages_unlocked
__get_user_pages_locked
__get_user_pages
follow_page_mask
follow_p4d_mask
follow_pud_mask
follow_pmd_mask
follow_page_pte
if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags))
[ causes to return NULL page if access violation ]
As far as i could see none of the get_user_pages() callers are passing
struct vm_area_struct ** to get it populated.
And Ingo's series eons ago didn't pass it or check permission
using it either (it was passing a 'write' argument, which i believe
corresponds to FOLL_WRITE)
Am i missing something or wrong in the analysis ?
Regards
afzal
On Tue, Jun 9, 2020 at 2:15 PM afzal mohammed <[email protected]> wrote:
> On Mon, Jun 08, 2020 at 08:47:27PM +0530, afzal mohammed wrote:
> > On Mon, Jun 08, 2020 at 04:43:57PM +0200, Arnd Bergmann wrote:
>
> > > There is another difference: get_user_pages_fast() does not return
> > > a vm_area_struct pointer, which is where you would check the access
> > > permissions. I suppose those pointers could not be returned to callers
> > > that don't already hold the mmap_sem.
> >
> > Ok, thanks for the details, i need to familiarize better with mm.
>
> i was & now more confused w.r.t checking access permission using
> vm_area_struct to deny write on a read only user page.
>
> i have been using get_user_pages_fast() w/ FOLL_WRITE in copy_to_user.
> Isn't that sufficient ?, afaiu, get_user_pages_fast() will ensure that
> w/ FOLL_WRITE, pte has write permission, else no struct page * is
> handed back to the caller.
Ah, that does make a lot of sense, I had just never used that flag
myself so I wasn't aware of this.
> Am i missing something or wrong in the analysis ?
As far as I can tell, you are absolutely right, and get_user_pages_fast()
is the best way to handle this correctly and efficiently.
Arnd
On Mon, Jun 8, 2020 at 1:09 PM afzal mohammed <[email protected]> wrote:
> On Sun, Jun 07, 2020 at 05:11:16PM +0100, Russell King - ARM Linux admin wrote:
> > On Sun, Jun 07, 2020 at 06:29:32PM +0530, afzal mohammed wrote:
>
> > > get_user_pages_fast() followed by kmap_atomic() & then memcpy() seems
> > > to work in principle for user copy.
> >
> > Have you done any performance evaluation of the changes yet? I think
> > it would be a good idea to keep that in the picture. If there's any
> > significant regression, then that will need addressing.
>
> Not yet. Yes, i will do the performance evaluation.
>
> i am also worried about the impact on performance as these
> [ get_user_pages() or friends, kmap_atomic() ] are additionally
> invoked in the copy_{from,to}_user() path now.
I am happy to help!
I am anyway working on MMU-related code (KASan) so I need to be on
top of this stuff.
What test is appropriate for this? I would intuitively think hackbench?
> Note that this was done on a topic branch for user copy. Changes for
> kernel static mapping to vmalloc has not been merged with these.
> Also having kernel lowmem w/ a separate asid & switching at kernel
> entry/exit b/n user & kernel lowmem by changing ttbr0 is yet to be
> done. Quite a few things remaining to be done to achieve vmsplit 4g/4g
I will be very excited to look at patches or a git branch once you have
something you want to show. Also to just understand how you go about
this. I have several elder systems under my
roof so my contribution could hopefully be to help and debug any issues
on these.
Yours,
Linus Walleij
Hi,
On Wed, Jun 10, 2020 at 12:10:21PM +0200, Linus Walleij wrote:
> On Mon, Jun 8, 2020 at 1:09 PM afzal mohammed <[email protected]> wrote:
> > Not yet. Yes, i will do the performance evaluation.
> >
> > i am also worried about the impact on performance as these
> > [ get_user_pages() or friends, kmap_atomic() ] are additionally
> > invoked in the copy_{from,to}_user() path now.
>
> I am happy to help!
Thanks Linus
> I am anyway working on MMU-related code (KASan) so I need to be on
> top of this stuff.
i earlier went thr' KASAN series secretly & did learn a thing or two
from that!
> What test is appropriate for this? I would intuitively think hackbench?
'dd', i think, as you mentioned 'hackbench' i will use that as well.
> > Note that this was done on a topic branch for user copy. Changes for
> > kernel static mapping to vmalloc has not been merged with these.
> > Also having kernel lowmem w/ a separate asid & switching at kernel
> > entry/exit b/n user & kernel lowmem by changing ttbr0 is yet to be
> > done. Quite a few things remaining to be done to achieve vmsplit 4g/4g
>
> I will be very excited to look at patches or a git branch once you have
> something you want to show. Also to just understand how you go about
> this.
Don't put too much expectation on me, this is more of a learning for
me. For user copy, the baby steps have been posted (To'ed you). On the
static kernel mapping on vmalloc front, i do not want to post the
patches in the current shape, though git-ized, will result in me
getting mercilessly thrashed in public :). Many of the other platforms
would fail and is not multi-platform friendly. i do not yet have a
public git branch, i can send you the (ugly) patches separately, just
let me know.
> I have several elder systems under my roof
i have only a few low RAM & CPU systems, so that is certainly helpful.
> so my contribution could hopefully be to help and debug any issues
If you would like, we can work together, at the same time keep in mind
that me spending time on it would be intermittent & erratic (though i
am trying to keep a consistent, but slow pace) perhaps making it
difficult to coordinate. Or else i will continue the same way & request
your help when required.
For the next 3 weeks, right now, i cannot say whether i would be able
to spend time on it, perhaps might be possible, but only during that
time i will know.
Regards
afzal
Hi Afzal!
On Fri, Jun 12, 2020 at 12:25 PM afzal mohammed <[email protected]> wrote:
> > > Note that this was done on a topic branch for user copy. Changes for
> > > kernel static mapping to vmalloc has not been merged with these.
> > > Also having kernel lowmem w/ a separate asid & switching at kernel
> > > entry/exit b/n user & kernel lowmem by changing ttbr0 is yet to be
> > > done. Quite a few things remaining to be done to achieve vmsplit 4g/4g
> >
> > I will be very excited to look at patches or a git branch once you have
> > something you want to show. Also to just understand how you go about
> > this.
>
> Don't put too much expectation on me, this is more of a learning for
> me. For user copy, the baby steps has been posted (To'ed you). On the
> static kernel mapping on vmalloc front, i do not want to post the
> patches in the current shape, though git-ized, will result in me
> getting mercilessly thrashed in public :). Many of the other platforms
> would fail and is not multi-platform friendly. i do not yet have a
> public git branch, i can send you the (ugly) patches separately, just
> let me know.
OK I would be very happy to look at it so I can learn a bit about the
hands-on and general approach here. Just send it to this address
directly and I will look!
My interest is mainly to see this progress so any way I can help or
tinker I'm happy to do. You can just incorporate my contributions if
any, I don't care much about code authorship and such things, it
just makes things more complex.
> If you would like, we can work together, at the same time keep in mind
> that me spending time on it would be intermittent & erratic (though i
> am trying to keep a consistent, but slow pace) perhaps making it
> difficult to coordinate. Or else i will continue the same way & request
> your help when required.
>
> For the next 3 weeks, right now, i cannot say whether i would be able
> to spend time on it, perhaps might be possible, but only during that
> time i will know.
I'm going for vacation the next 2 weeks or so, but then it'd be great if
we can start looking at this in-depth!
Thanks!
Linus Walleij
Hi Linus,
On Mon, Jun 15, 2020 at 11:11:04AM +0200, Linus Walleij wrote:
> OK I would be very happy to look at it so I can learn a bit about the
> hands-on and general approach here. Just send it to this address
> directly and I will look!
Have sent it
> > For the next 3 weeks, right now, i cannot say whether i would be able
> > to spend time on it, perhaps might be possible, but only during that
> > time i will know.
>
> I'm going for vacation the next 2 weeks or so, but then it'd be great if
> we can start looking at this in-depth!
Yes for me too
Regards
afzal
On Fri, May 15, 2020 at 5:41 PM Arnd Bergmann <[email protected]> wrote:
[Russell]
> > There are some aliasing VIPT implementations on ARMv6, but I don't
> > remember how common.
>
> I thought it was only realview-pb and omap2, but it seems there
> are more, at least ast2500 is an important example.
>
> I could not find information about integrator-cp and picoxcell.
(...)
> integrator CM1136JF-S core module: arm1136r?, 16kb non-aliasing VIPT
> ? integrator CTB36 core tile: arm1136r?, ???
These do exist, the Integrators have pluggable CPU core modules.
What you do is populate the core module slot on the Integrator CP
with a CM1136.
That said, I think I am the only user of the Integrator/CP actual
hardware. And I don't have this core module. So I think it will be
safe to drop support for that specific VIPT implementation by the
token that if a tree falls in the forest and no one
is there to hear it, it does not make a sound.
As for physically existing VIPT 1136/1176 systems the Ambarella
legacy SoCs that are not upstream is the big consumer of these.
Ambarella's main customer is GoPro cameras and similar
products. I have no idea if they ever upgrade kernels on these
things though, I think not, but it would be great if someone knows
them and can ask whether this is a concern for them. (They
should be working with the community IMO, but is one of those
companies that for some reason do not.)
Yours,
Linus Walleij
On Thu, Jul 30, 2020 at 11:33 AM Linus Walleij <[email protected]> wrote:
> On Fri, May 15, 2020 at 5:41 PM Arnd Bergmann <[email protected]> wrote:
>
> [Russell]
> > > There are some aliasing VIPT implementations on ARMv6, but I don't
> > > remember how common.
> >
> > I thought it was only realview-pb and omap2, but it seems there
> > are more, at least ast2500 is an important example.
> >
> > I could not find information about integrator-cp and picoxcell.
> (...)
> > integrator CM1136JF-S core module: arm1136r?, 16kb non-aliasing VIPT
> > ? integrator CTB36 core tile: arm1136r?, ???
>
> These do exist, the Integrators have pluggable CPU core modules.
> What you do is populate the core module slot on the Integrator CP
> with a CM1136.
Here the question is really what the cache size would be. 16kb
caches are non-aliasing, while 32kb caches would be aliasing.
The particular core revision would tell you whether this is an ARMv6
(1136r0) or ARMv6k (1136r1) implementation.
> That said, I think I am the only user of the Integrator/CP actual
> hardware. And I don't have this core module. So I think it will be
> safe to drop support for that specific VIPT implementation by the
> token that if a tree falls in the forest and noone
> is there to hear it, it does not make a sound.
>
> As for physically existing VIPT 1136/1176 systems the Ambarella
> legacy SoCs that are not upstream is the big consumer of these.
>
> Ambarella's main customer is GoPro cameras and similar
> products. I have no idea if they ever upgrade kernels on these
> things though, I think not, but it would be great if someone knows
> them and can ask whether this is a concern for them. (They
> should be working with the community IMO, but is one of those
> companies that for some reason do not.)
It seems unlikely that there is still enough interest in the old
GoPro chips.
Apparently GoPro Hero3+ from 2013 already used a Cortex-A9
based Ambarella chip, and according to Wikipedia in 2017 they
started making their own SoCs rather than using Ambarella's.
I found some source code for both the arm11 version [1] and
the Cortex-A9 based chips; the last update on either of those
was in 2016. The boot log in [2] shows this is a nonaliasing
cache btw.
Anyway, as I said earlier, as long as AST2500 (or OMAP2) is used,
aliasing dcaches remain a concern for ARMv6-enabled kernels.
Arnd
[1] https://github.com/evilwombat/gopro-linux/tree/master/arch/arm/mach-ambarella
[2] https://www.tapatalk.com/groups/goprouser/hero3-black-firmware-studies-physical-teardown-pho-t10016-s10.html#p58148