There is no guarantee that ACPI tables will be located in RAM linearly
mapped by the kernel. This could be because UEFI placed them below the
kernel image or because mem= places them beyond the reach of the linear
kernel mapping. Even though these tables are outside the linear mapped
RAM, they still need to be accessed as normal memory in order to support
unaligned accesses from ACPI code. In this case, the page_is_ram() test
in acpi_os_ioremap() is not sufficient. Additionally, if the table spans
multiple pages, it may fall partially within the linear map and partially
without. If the table overlaps the end of the linear map, the test for
whether or not to use the existing mapping in ioremap_cache() could lead
to a panic when ACPI code tries to access the part beyond the end of the
linear map. This patch attempts to address these problems.
Cc: Hanjun Guo <[email protected]>
Cc: Ard Biesheuvel <[email protected]>
Cc: Matt Fleming <[email protected]>
Signed-off-by: Mark Salter <[email protected]>
---
arch/arm64/include/asm/acpi.h | 6 ++-
arch/arm64/include/asm/efi.h | 2 +
arch/arm64/kernel/acpi.c | 13 ++++++
arch/arm64/kernel/efi.c | 95 +++++++++++++++++++++++++++++++++++++++++++
arch/arm64/kernel/setup.c | 1 +
arch/arm64/mm/ioremap.c | 3 +-
6 files changed, 118 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index 59c05d8..0849533 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -20,11 +20,15 @@
/* Basic configuration for ACPI */
#ifdef CONFIG_ACPI
+extern int page_is_acpi_ram(unsigned long pfn);
+
/* ACPI table mapping after acpi_gbl_permanent_mmap is set */
static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys,
acpi_size size)
{
- if (!page_is_ram(phys >> PAGE_SHIFT))
+ unsigned long pfn = phys >> PAGE_SHIFT;
+
+ if (!page_is_ram(pfn) && !page_is_acpi_ram(pfn))
return ioremap(phys, size);
return ioremap_cache(phys, size);
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index ef57220..3d5c12a 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -6,8 +6,10 @@
#ifdef CONFIG_EFI
extern void efi_init(void);
+extern void efi_request_acpi_resources(void);
#else
#define efi_init()
+#define efi_request_acpi_resources()
#endif
#define efi_call_virt(f, ...) \
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 8b83955..2c85ca0 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -343,3 +343,16 @@ void __init acpi_gic_init(void)
early_acpi_os_unmap_memory((char *)table, tbl_size);
}
+
+static int __is_acpi_ram(u64 start, u64 end, void *arg)
+{
+ return 1;
+}
+
+int page_is_acpi_ram(unsigned long pfn)
+{
+ u64 addr = (u64) pfn << PAGE_SHIFT;
+
+ return walk_iomem_res("ACPI RAM", IORESOURCE_MEM | IORESOURCE_BUSY,
+ addr, addr, NULL, __is_acpi_ram) == 1;
+}
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index ab21e0d..2d914d7 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -204,6 +204,101 @@ static __init void reserve_regions(void)
set_bit(EFI_MEMMAP, &efi.flags);
}
+#ifdef CONFIG_ACPI
+static void __init __insert_resource(struct resource *res)
+{
+ struct resource *r, *conflict;
+
+ r = alloc_bootmem_low(sizeof(*r));
+ *r = *res;
+
+ /*
+ * When inserting a resource, there will be a conflict
+ * only if the resource being inserted partially overlaps
+ * an existing resource. If the resource being inserted
+ * is entirely within an existing resource, it becomes
+ * a child of that resource with no conflict. So if we
+ * do get a conflict, split the one resource into two
+ * resources: one inside the conflict and one outside.
+ */
+ conflict = insert_resource_conflict(&iomem_resource, r);
+ if (conflict) {
+ struct resource *tmp;
+
+ tmp = alloc_bootmem_low(sizeof(*tmp));
+ *tmp = *r;
+
+ if (r->start >= conflict->start) {
+ r->start = conflict->end + 1;
+ tmp->end = conflict->end;
+ } else {
+ r->end = conflict->start - 1;
+ tmp->start = conflict->start;
+ }
+ insert_resource(&iomem_resource, tmp);
+ insert_resource(&iomem_resource, r);
+ }
+}
+
+/*
+ * Create /proc/iomem resources for any ACPI regions in RAM.
+ */
+void __init efi_request_acpi_resources(void)
+{
+ struct resource res;
+ efi_memory_desc_t *md;
+ u64 start, end, npages;
+ unsigned long mapsize = memmap.map_end - memmap.map;
+ void *map;
+
+ map = early_memremap((resource_size_t)memmap.phys_map, mapsize);
+ if (map == NULL)
+ return;
+ memmap.map = map;
+ memmap.map_end = map + mapsize;
+
+ memset(&res, 0, sizeof(res));
+ res.name = "ACPI RAM";
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+ for_each_efi_memory_desc(&memmap, md) {
+ if (!is_normal_ram(md) ||
+ !(md->type == EFI_ACPI_RECLAIM_MEMORY ||
+ md->type == EFI_ACPI_MEMORY_NVS))
+ continue;
+
+ start = md->phys_addr;
+ npages = md->num_pages;
+ memrange_efi_to_native(&start, &npages);
+ end = start + (npages << PAGE_SHIFT) - 1;
+
+ if (res.end == 0) {
+ res.start = start;
+ res.end = end;
+ continue;
+ }
+
+ if (start >= res.start && (start - 1) <= res.end) {
+ /* overlaps (or adjacent to) end of old region */
+ if (end > res.end)
+ res.end = end;
+ } else if (end >= (res.start - 1) && end <= res.end) {
+ /* overlaps (or adjacent to) start of old region */
+ if (start < res.start)
+ res.start = start;
+ } else {
+ __insert_resource(&res);
+ res.start = start;
+ res.end = end;
+ }
+ }
+ if (res.end)
+ __insert_resource(&res);
+
+ early_memunmap(memmap.map, mapsize);
+}
+#endif
+
void __init efi_init(void)
{
struct efi_fdt_params params;
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 7475313..a438a0c 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -413,6 +413,7 @@ void __init setup_arch(char **cmdline_p)
of_smp_init_cpus();
#endif
} else {
+ efi_request_acpi_resources();
psci_acpi_init();
acpi_init_cpus();
}
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index 01e88c8..d62e701 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -96,7 +96,8 @@ EXPORT_SYMBOL(__iounmap);
void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size)
{
/* For normal memory we already have a cacheable mapping. */
- if (pfn_valid(__phys_to_pfn(phys_addr)))
+ if (pfn_valid(__phys_to_pfn(phys_addr)) &&
+ pfn_valid(__phys_to_pfn(phys_addr + size - 1)))
return (void __iomem *)__phys_to_virt(phys_addr);
return __ioremap_caller(phys_addr, size, __pgprot(PROT_NORMAL),
--
1.9.3
On 14 May 2015 at 16:22, Mark Salter <[email protected]> wrote:
> There is no guarantee that ACPI tables will be located in RAM linearly
> mapped by the kernel. This could be because UEFI placed them below the
> kernel image or because mem= places them beyond the reach of the linear
> kernel mapping. Even though these tables are outside the linear mapped
> RAM, they still need to be accessed as normal memory in order to support
> unaligned accesses from ACPI code. In this case, the page_is_ram() test
> in acpi_os_ioremap() is not sufficient. Additionally, if the table spans
> multiple pages, it may fall partially within the linear map and partially
> without. If the table overlaps the end of the linear map, the test for
> whether or not to use the existing mapping in ioremap_cache() could lead
> to a panic when ACPI code tries to access the part beyond the end of the
> linear map. This patch attempts to address these problems.
>
I would strongly prefer memblock_remove()'ing all UEFI reserved
regions entirely, and keeping track of which areas are backed my RAM
using a table rather than string matching on the iomem resource table.
When I looked into this a while ago [1], I ended up with a separate
physmem memblock table (borrowed from another arch) that tracks the
regions which are memory while removing all the EFI reserved region
from the 'memory' memblock table. That way, page_is_ram() could be
reimplemented as memblock_is_physmem(), and ioremap_cache() would
always do the right thing automagically.
I kind of held off with this series until the ACPI stuff had landed,
which is obviously the case now. Would you mind having a look at these
patches and sharing your opinion?
[1] http://thread.gmane.org/gmane.linux.kernel.efi/5133
--
Ard.
> Cc: Hanjun Guo <[email protected]>
> Cc: Ard Biesheuvel <[email protected]>
> Cc: Matt Fleming <[email protected]>
> Signed-off-by: Mark Salter <[email protected]>
> ---
> arch/arm64/include/asm/acpi.h | 6 ++-
> arch/arm64/include/asm/efi.h | 2 +
> arch/arm64/kernel/acpi.c | 13 ++++++
> arch/arm64/kernel/efi.c | 95 +++++++++++++++++++++++++++++++++++++++++++
> arch/arm64/kernel/setup.c | 1 +
> arch/arm64/mm/ioremap.c | 3 +-
> 6 files changed, 118 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
> index 59c05d8..0849533 100644
> --- a/arch/arm64/include/asm/acpi.h
> +++ b/arch/arm64/include/asm/acpi.h
> @@ -20,11 +20,15 @@
>
> /* Basic configuration for ACPI */
> #ifdef CONFIG_ACPI
> +extern int page_is_acpi_ram(unsigned long pfn);
> +
> /* ACPI table mapping after acpi_gbl_permanent_mmap is set */
> static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys,
> acpi_size size)
> {
> - if (!page_is_ram(phys >> PAGE_SHIFT))
> + unsigned long pfn = phys >> PAGE_SHIFT;
> +
> + if (!page_is_ram(pfn) && !page_is_acpi_ram(pfn))
> return ioremap(phys, size);
>
> return ioremap_cache(phys, size);
> diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
> index ef57220..3d5c12a 100644
> --- a/arch/arm64/include/asm/efi.h
> +++ b/arch/arm64/include/asm/efi.h
> @@ -6,8 +6,10 @@
>
> #ifdef CONFIG_EFI
> extern void efi_init(void);
> +extern void efi_request_acpi_resources(void);
> #else
> #define efi_init()
> +#define efi_request_acpi_resources()
> #endif
>
> #define efi_call_virt(f, ...) \
> diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
> index 8b83955..2c85ca0 100644
> --- a/arch/arm64/kernel/acpi.c
> +++ b/arch/arm64/kernel/acpi.c
> @@ -343,3 +343,16 @@ void __init acpi_gic_init(void)
>
> early_acpi_os_unmap_memory((char *)table, tbl_size);
> }
> +
> +static int __is_acpi_ram(u64 start, u64 end, void *arg)
> +{
> + return 1;
> +}
> +
> +int page_is_acpi_ram(unsigned long pfn)
> +{
> + u64 addr = (u64) pfn << PAGE_SHIFT;
> +
> + return walk_iomem_res("ACPI RAM", IORESOURCE_MEM | IORESOURCE_BUSY,
> + addr, addr, NULL, __is_acpi_ram) == 1;
> +}
> diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
> index ab21e0d..2d914d7 100644
> --- a/arch/arm64/kernel/efi.c
> +++ b/arch/arm64/kernel/efi.c
> @@ -204,6 +204,101 @@ static __init void reserve_regions(void)
> set_bit(EFI_MEMMAP, &efi.flags);
> }
>
> +#ifdef CONFIG_ACPI
> +static void __init __insert_resource(struct resource *res)
> +{
> + struct resource *r, *conflict;
> +
> + r = alloc_bootmem_low(sizeof(*r));
> + *r = *res;
> +
> + /*
> + * When inserting a resource, there will be a conflict
> + * only if the resource being inserted partially overlaps
> + * an existing resource. If the resource being inserted
> + * is entirely within an existing resource, it becomes
> + * a child of that resource with no conflict. So if we
> + * do get a conflict, split the one resource into two
> + * resources: one inside the conflict and one outside.
> + */
> + conflict = insert_resource_conflict(&iomem_resource, r);
> + if (conflict) {
> + struct resource *tmp;
> +
> + tmp = alloc_bootmem_low(sizeof(*tmp));
> + *tmp = *r;
> +
> + if (r->start >= conflict->start) {
> + r->start = conflict->end + 1;
> + tmp->end = conflict->end;
> + } else {
> + r->end = conflict->start - 1;
> + tmp->start = conflict->start;
> + }
> + insert_resource(&iomem_resource, tmp);
> + insert_resource(&iomem_resource, r);
> + }
> +}
> +
> +/*
> + * Create /proc/iomem resources for any ACPI regions in RAM.
> + */
> +void __init efi_request_acpi_resources(void)
> +{
> + struct resource res;
> + efi_memory_desc_t *md;
> + u64 start, end, npages;
> + unsigned long mapsize = memmap.map_end - memmap.map;
> + void *map;
> +
> + map = early_memremap((resource_size_t)memmap.phys_map, mapsize);
> + if (map == NULL)
> + return;
> + memmap.map = map;
> + memmap.map_end = map + mapsize;
> +
> + memset(&res, 0, sizeof(res));
> + res.name = "ACPI RAM";
> + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> +
> + for_each_efi_memory_desc(&memmap, md) {
> + if (!is_normal_ram(md) ||
> + !(md->type == EFI_ACPI_RECLAIM_MEMORY ||
> + md->type == EFI_ACPI_MEMORY_NVS))
> + continue;
> +
> + start = md->phys_addr;
> + npages = md->num_pages;
> + memrange_efi_to_native(&start, &npages);
> + end = start + (npages << PAGE_SHIFT) - 1;
> +
> + if (res.end == 0) {
> + res.start = start;
> + res.end = end;
> + continue;
> + }
> +
> + if (start >= res.start && (start - 1) <= res.end) {
> + /* overlaps (or adjacent to) end of old region */
> + if (end > res.end)
> + res.end = end;
> + } else if (end >= (res.start - 1) && end <= res.end) {
> + /* overlaps (or adjacent to) start of old region */
> + if (start < res.start)
> + res.start = start;
> + } else {
> + __insert_resource(&res);
> + res.start = start;
> + res.end = end;
> + }
> + }
> + if (res.end)
> + __insert_resource(&res);
> +
> + early_memunmap(memmap.map, mapsize);
> +}
> +#endif
> +
> void __init efi_init(void)
> {
> struct efi_fdt_params params;
> diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
> index 7475313..a438a0c 100644
> --- a/arch/arm64/kernel/setup.c
> +++ b/arch/arm64/kernel/setup.c
> @@ -413,6 +413,7 @@ void __init setup_arch(char **cmdline_p)
> of_smp_init_cpus();
> #endif
> } else {
> + efi_request_acpi_resources();
> psci_acpi_init();
> acpi_init_cpus();
> }
> diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
> index 01e88c8..d62e701 100644
> --- a/arch/arm64/mm/ioremap.c
> +++ b/arch/arm64/mm/ioremap.c
> @@ -96,7 +96,8 @@ EXPORT_SYMBOL(__iounmap);
> void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size)
> {
> /* For normal memory we already have a cacheable mapping. */
> - if (pfn_valid(__phys_to_pfn(phys_addr)))
> + if (pfn_valid(__phys_to_pfn(phys_addr)) &&
> + pfn_valid(__phys_to_pfn(phys_addr + size - 1)))
> return (void __iomem *)__phys_to_virt(phys_addr);
>
> return __ioremap_caller(phys_addr, size, __pgprot(PROT_NORMAL),
> --
> 1.9.3
>
On Thu, 2015-05-14 at 16:50 +0200, Ard Biesheuvel wrote:
> On 14 May 2015 at 16:22, Mark Salter <[email protected]> wrote:
> > There is no guarantee that ACPI tables will be located in RAM linearly
> > mapped by the kernel. This could be because UEFI placed them below the
> > kernel image or because mem= places them beyond the reach of the linear
> > kernel mapping. Even though these tables are outside the linear mapped
> > RAM, they still need to be accessed as normal memory in order to support
> > unaligned accesses from ACPI code. In this case, the page_is_ram() test
> > in acpi_os_ioremap() is not sufficient. Additionally, if the table spans
> > multiple pages, it may fall partially within the linear map and partially
> > without. If the table overlaps the end of the linear map, the test for
> > whether or not to use the existing mapping in ioremap_cache() could lead
> > to a panic when ACPI code tries to access the part beyond the end of the
> > linear map. This patch attempts to address these problems.
> >
>
> I would strongly prefer memblock_remove()'ing all UEFI reserved
> regions entirely, and keeping track of which areas are backed my RAM
> using a table rather than string matching on the iomem resource table.
Why? It isn't a performance issue is it? ACPI caches mappings and out of
the two systems I checked, one called acpi_os_ioremap() 11 times, and
the other only once.
> When I looked into this a while ago [1], I ended up with a separate
> physmem memblock table (borrowed from another arch) that tracks the
> regions which are memory while removing all the EFI reserved region
> from the 'memory' memblock table. That way, page_is_ram() could be
> reimplemented as memblock_is_physmem(), and ioremap_cache() would
> always do the right thing automagically.
>
> I kind of held off with this series until the ACPI stuff had landed,
> which is obviously the case now. Would you mind having a look at these
> patches and sharing your opinion?
> [1] http://thread.gmane.org/gmane.linux.kernel.efi/5133
>
Thanks for reminding me of this. I went back and took a look. Using
memblock physmem would simplify this patch but the problem I see is
that physmem only has 4 regions. I don't think that would be enough,
so how many is enough?
Right now, I'd like to fix the real problem of mem= breaking ACPI
boots ASAP. Some more thought needs to go into using memblock physmem
and if it does turn out to be better or needed for other things like
removing reserved regions from memory, then that patch can clean up
the use of the iomem table.
On Thu, May 14, 2015 at 10:22:53AM -0400, Mark Salter wrote:
> There is no guarantee that ACPI tables will be located in RAM linearly
> mapped by the kernel. This could be because UEFI placed them below the
> kernel image or because mem= places them beyond the reach of the linear
> kernel mapping. Even though these tables are outside the linear mapped
> RAM, they still need to be accessed as normal memory in order to support
> unaligned accesses from ACPI code. In this case, the page_is_ram() test
> in acpi_os_ioremap() is not sufficient.
And can we not simply add the rest of the RAM to the resource list as
"System RAM" without being part of memblock?
> Additionally, if the table spans multiple pages, it may fall partially
> within the linear map and partially without. If the table overlaps the
> end of the linear map, the test for whether or not to use the existing
> mapping in ioremap_cache() could lead to a panic when ACPI code tries
> to access the part beyond the end of the linear map. This patch
> attempts to address these problems.
That's a problem with ioremap_cache() that should be fixed independently.
Ideally, I'd like to see the ACPI code use different APIs to distinguish
between table access in RAM and device access, so that we don't have to
guess whether the page is RAM or not.
--
Catalin
On Mon, 2015-05-18 at 12:11 +0100, Catalin Marinas wrote:
> On Thu, May 14, 2015 at 10:22:53AM -0400, Mark Salter wrote:
> > There is no guarantee that ACPI tables will be located in RAM linearly
> > mapped by the kernel. This could be because UEFI placed them below the
> > kernel image or because mem= places them beyond the reach of the linear
> > kernel mapping. Even though these tables are outside the linear mapped
> > RAM, they still need to be accessed as normal memory in order to support
> > unaligned accesses from ACPI code. In this case, the page_is_ram() test
> > in acpi_os_ioremap() is not sufficient.
>
> And can we not simply add the rest of the RAM to the resource list as
> "System RAM" without being part of memblock?
If it is in "System RAM", then it needs a valid pfn and struct page.
Parts of the kernel expect that (page_is_ram(), memory hotplug, etc).
>
> > Additionally, if the table spans multiple pages, it may fall partially
> > within the linear map and partially without. If the table overlaps the
> > end of the linear map, the test for whether or not to use the existing
> > mapping in ioremap_cache() could lead to a panic when ACPI code tries
> > to access the part beyond the end of the linear map. This patch
> > attempts to address these problems.
>
> That's a problem with ioremap_cache() that should be fixed independently.
I can submit that separately if you prefer.
>
> Ideally, I'd like to see the ACPI code use different APIs to distinguish
> between table access in RAM and device access, so that we don't have to
> guess whether the page is RAM or not.
>
I don't think the ACPI code has enough info to make that decision, but
I'm not sure honestly.
On Mon, May 18, 2015 at 09:58:45AM -0400, Mark Salter wrote:
> On Mon, 2015-05-18 at 12:11 +0100, Catalin Marinas wrote:
> > On Thu, May 14, 2015 at 10:22:53AM -0400, Mark Salter wrote:
> > > There is no guarantee that ACPI tables will be located in RAM linearly
> > > mapped by the kernel. This could be because UEFI placed them below the
> > > kernel image or because mem= places them beyond the reach of the linear
> > > kernel mapping. Even though these tables are outside the linear mapped
> > > RAM, they still need to be accessed as normal memory in order to support
> > > unaligned accesses from ACPI code. In this case, the page_is_ram() test
> > > in acpi_os_ioremap() is not sufficient.
> >
> > And can we not simply add the rest of the RAM to the resource list as
> > "System RAM" without being part of memblock?
>
> If it is in "System RAM", then it needs a valid pfn and struct page.
> Parts of the kernel expect that (page_is_ram(), memory hotplug, etc).
OK, I had the impression that we could get away with this.
> > > Additionally, if the table spans multiple pages, it may fall partially
> > > within the linear map and partially without. If the table overlaps the
> > > end of the linear map, the test for whether or not to use the existing
> > > mapping in ioremap_cache() could lead to a panic when ACPI code tries
> > > to access the part beyond the end of the linear map. This patch
> > > attempts to address these problems.
> >
> > That's a problem with ioremap_cache() that should be fixed independently.
>
> I can submit that separately if you prefer.
Yes, please.
> > Ideally, I'd like to see the ACPI code use different APIs to distinguish
> > between table access in RAM and device access, so that we don't have to
> > guess whether the page is RAM or not.
>
> I don't think the ACPI code has enough info to make that decision, but
> I'm not sure honestly.
Do we have a guarantee that UEFI tells the kernel about the whole RAM?
--
Catalin
On 18 May 2015 at 18:41, Catalin Marinas <[email protected]> wrote:
> On Mon, May 18, 2015 at 09:58:45AM -0400, Mark Salter wrote:
>> On Mon, 2015-05-18 at 12:11 +0100, Catalin Marinas wrote:
>> > On Thu, May 14, 2015 at 10:22:53AM -0400, Mark Salter wrote:
>> > > There is no guarantee that ACPI tables will be located in RAM linearly
>> > > mapped by the kernel. This could be because UEFI placed them below the
>> > > kernel image or because mem= places them beyond the reach of the linear
>> > > kernel mapping. Even though these tables are outside the linear mapped
>> > > RAM, they still need to be accessed as normal memory in order to support
>> > > unaligned accesses from ACPI code. In this case, the page_is_ram() test
>> > > in acpi_os_ioremap() is not sufficient.
>> >
>> > And can we not simply add the rest of the RAM to the resource list as
>> > "System RAM" without being part of memblock?
>>
>> If it is in "System RAM", then it needs a valid pfn and struct page.
>> Parts of the kernel expect that (page_is_ram(), memory hotplug, etc).
>
> OK, I had the impression that we could get away with this.
>
>> > > Additionally, if the table spans multiple pages, it may fall partially
>> > > within the linear map and partially without. If the table overlaps the
>> > > end of the linear map, the test for whether or not to use the existing
>> > > mapping in ioremap_cache() could lead to a panic when ACPI code tries
>> > > to access the part beyond the end of the linear map. This patch
>> > > attempts to address these problems.
>> >
>> > That's a problem with ioremap_cache() that should be fixed independently.
>>
>> I can submit that separately if you prefer.
>
> Yes, please.
>
>> > Ideally, I'd like to see the ACPI code use different APIs to distinguish
>> > between table access in RAM and device access, so that we don't have to
>> > guess whether the page is RAM or not.
>>
>> I don't think the ACPI code has enough info to make that decision, but
>> I'm not sure honestly.
>
> Do we have a guarantee that UEFI tells the kernel about the whole RAM?
>
Yes, the UEFI memory map must describe all of RAM, no matter how it is
used. I may also describe some MMIO regions, but typically only
regions that it needs itself to implement the UEFI Runtime Services
(e.g., RTC base address, NOR flash for the variable store)
So we could potentially query the UEFI memory map directly to find out
whether some otherwise unqualified region is backed by RAM or not,
although I'd prefer some intermediate data structure (such as the
physmem memblock table) if we go that route.
--
Ard.
On Mon, May 18, 2015 at 06:49:28PM +0200, Ard Biesheuvel wrote:
> On 18 May 2015 at 18:41, Catalin Marinas <[email protected]> wrote:
> > On Mon, May 18, 2015 at 09:58:45AM -0400, Mark Salter wrote:
> >> On Mon, 2015-05-18 at 12:11 +0100, Catalin Marinas wrote:
> >> > On Thu, May 14, 2015 at 10:22:53AM -0400, Mark Salter wrote:
> >> > > There is no guarantee that ACPI tables will be located in RAM linearly
> >> > > mapped by the kernel. This could be because UEFI placed them below the
> >> > > kernel image or because mem= places them beyond the reach of the linear
> >> > > kernel mapping. Even though these tables are outside the linear mapped
> >> > > RAM, they still need to be accessed as normal memory in order to support
> >> > > unaligned accesses from ACPI code. In this case, the page_is_ram() test
> >> > > in acpi_os_ioremap() is not sufficient.
> >> >
> >> > And can we not simply add the rest of the RAM to the resource list as
> >> > "System RAM" without being part of memblock?
> >>
> >> If it is in "System RAM", then it needs a valid pfn and struct page.
> >> Parts of the kernel expect that (page_is_ram(), memory hotplug, etc).
> >
> > OK, I had the impression that we could get away with this.
> >
> >> > > Additionally, if the table spans multiple pages, it may fall partially
> >> > > within the linear map and partially without. If the table overlaps the
> >> > > end of the linear map, the test for whether or not to use the existing
> >> > > mapping in ioremap_cache() could lead to a panic when ACPI code tries
> >> > > to access the part beyond the end of the linear map. This patch
> >> > > attempts to address these problems.
> >> >
> >> > That's a problem with ioremap_cache() that should be fixed independently.
> >>
> >> I can submit that separately if you prefer.
> >
> > Yes, please.
> >
> >> > Ideally, I'd like to see the ACPI code use different APIs to distinguish
> >> > between table access in RAM and device access, so that we don't have to
> >> > guess whether the page is RAM or not.
> >>
> >> I don't think the ACPI code has enough info to make that decision, but
> >> I'm not sure honestly.
> >
> > Do we have a guarantee that UEFI tells the kernel about the whole RAM?
>
> Yes, the UEFI memory map must describe all of RAM, no matter how it is
> used. I may also describe some MMIO regions, but typically only
> regions that it needs itself to implement the UEFI Runtime Services
> (e.g., RTC base address, NOR flash for the variable store)
>
> So we could potentially query the UEFI memory map directly to find out
> whether some otherwise unqualified region is backed by RAM or not,
> although I'd prefer some intermediate data structure (such as the
> physmem memblock table) if we go that route.
OK, so my preferred options, in this order:
1. Change the core ACPI kernel code to distinguish between mapping I/O
or RAM (could be as simple as acpi_map not using acpi_os_ioremap but
another API). I guess the code knows when it plans to map tables or
I/O registers
2. If the above is not possible, add the extra checks as per Mark's
patch but I would rather call this resource "UEFI RAM" than "ACPI",
it's not really ACPI specific.
--
Catalin
On Fri, 2015-05-22 at 11:34 +0100, Catalin Marinas wrote:
> On Mon, May 18, 2015 at 06:49:28PM +0200, Ard Biesheuvel wrote:
> > On 18 May 2015 at 18:41, Catalin Marinas <[email protected]> wrote:
> > > On Mon, May 18, 2015 at 09:58:45AM -0400, Mark Salter wrote:
> > >> On Mon, 2015-05-18 at 12:11 +0100, Catalin Marinas wrote:
> > >> > On Thu, May 14, 2015 at 10:22:53AM -0400, Mark Salter wrote:
> > >> > > There is no guarantee that ACPI tables will be located in RAM linearly
> > >> > > mapped by the kernel. This could be because UEFI placed them below the
> > >> > > kernel image or because mem= places them beyond the reach of the linear
> > >> > > kernel mapping. Even though these tables are outside the linear mapped
> > >> > > RAM, they still need to be accessed as normal memory in order to support
> > >> > > unaligned accesses from ACPI code. In this case, the page_is_ram() test
> > >> > > in acpi_os_ioremap() is not sufficient.
> > >> >
> > >> > And can we not simply add the rest of the RAM to the resource list as
> > >> > "System RAM" without being part of memblock?
> > >>
> > >> If it is in "System RAM", then it needs a valid pfn and struct page.
> > >> Parts of the kernel expect that (page_is_ram(), memory hotplug, etc).
> > >
> > > OK, I had the impression that we could get away with this.
> > >
> > >> > > Additionally, if the table spans multiple pages, it may fall partially
> > >> > > within the linear map and partially without. If the table overlaps the
> > >> > > end of the linear map, the test for whether or not to use the existing
> > >> > > mapping in ioremap_cache() could lead to a panic when ACPI code tries
> > >> > > to access the part beyond the end of the linear map. This patch
> > >> > > attempts to address these problems.
> > >> >
> > >> > That's a problem with ioremap_cache() that should be fixed independently.
> > >>
> > >> I can submit that separately if you prefer.
> > >
> > > Yes, please.
> > >
> > >> > Ideally, I'd like to see the ACPI code use different APIs to distinguish
> > >> > between table access in RAM and device access, so that we don't have to
> > >> > guess whether the page is RAM or not.
> > >>
> > >> I don't think the ACPI code has enough info to make that decision, but
> > >> I'm not sure honestly.
> > >
> > > Do we have a guarantee that UEFI tells the kernel about the whole RAM?
> >
> > Yes, the UEFI memory map must describe all of RAM, no matter how it is
> > used. I may also describe some MMIO regions, but typically only
> > regions that it needs itself to implement the UEFI Runtime Services
> > (e.g., RTC base address, NOR flash for the variable store)
> >
> > So we could potentially query the UEFI memory map directly to find out
> > whether some otherwise unqualified region is backed by RAM or not,
> > although I'd prefer some intermediate data structure (such as the
> > physmem memblock table) if we go that route.
>
> OK, so my preferred options, in this order:
>
> 1. Change the core ACPI kernel code to distinguish between mapping I/O
> or RAM (could be as simple as acpi_map not using acpi_os_ioremap but
> another API). I guess the code knows when it plans to map tables or
> I/O registers
>
> 2. If the above is not possible, add the extra checks as per Mark's
> patch but I would rather call this resource "UEFI RAM" than "ACPI",
> it's not really ACPI specific.
>
Actually, it is ACPI specific. The patch only registers resources for
EfiACPIReclaimMemory and EfiACPIMemoryNVS regions which are also
marked as cacheable. On x86 these show up in /proc/iomem as
"ACPI Tables" and "ACPI Non-volatile Storage". I used "ACPI RAM" to
avoid having to search for two strings.
On Fri, May 22, 2015 at 08:46:02AM -0400, Mark Salter wrote:
> On Fri, 2015-05-22 at 11:34 +0100, Catalin Marinas wrote:
> > OK, so my preferred options, in this order:
> >
> > 1. Change the core ACPI kernel code to distinguish between mapping I/O
> > or RAM (could be as simple as acpi_map not using acpi_os_ioremap but
> > another API). I guess the code knows when it plans to map tables or
> > I/O registers
> >
> > 2. If the above is not possible, add the extra checks as per Mark's
> > patch but I would rather call this resource "UEFI RAM" than "ACPI",
> > it's not really ACPI specific.
>
> Actually, it is ACPI specific. The patch only registers resources for
> EfiACPIReclaimMemory and EfiACPIMemoryNVS regions which are also
> marked as cacheable. On x86 these show up in /proc/iomem as
> "ACPI Tables" and "ACPI Non-volatile Storage". I used "ACPI RAM" to
> avoid having to search for two strings.
My point is more about UEFI describing the entire RAM while the kernel
command line restricts it via "mem=". In this case, the "System RAM"
resources is reduced as well but it does not necessarily mean that the
rest of the RAM is only used by ACPI.
--
Catalin
On Fri, 2015-05-22 at 13:53 +0100, Catalin Marinas wrote:
> On Fri, May 22, 2015 at 08:46:02AM -0400, Mark Salter wrote:
> > On Fri, 2015-05-22 at 11:34 +0100, Catalin Marinas wrote:
> > > OK, so my preferred options, in this order:
> > >
> > > 1. Change the core ACPI kernel code to distinguish between mapping I/O
> > > or RAM (could be as simple as acpi_map not using acpi_os_ioremap but
> > > another API). I guess the code knows when it plans to map tables or
> > > I/O registers
> > >
> > > 2. If the above is not possible, add the extra checks as per Mark's
> > > patch but I would rather call this resource "UEFI RAM" than "ACPI",
> > > it's not really ACPI specific.
> >
> > Actually, it is ACPI specific. The patch only registers resources for
> > EfiACPIReclaimMemory and EfiACPIMemoryNVS regions which are also
> > marked as cacheable. On x86 these show up in /proc/iomem as
> > "ACPI Tables" and "ACPI Non-volatile Storage". I used "ACPI RAM" to
> > avoid having to search for two strings.
>
> My point is more about UEFI describing the entire RAM while the kernel
> command line restricts it via "mem=". In this case, the "System RAM"
> resources is reduced as well but it does not necessarily mean that the
> rest of the RAM is only used by ACPI.
>
Ah okay. But I'm not sure we want the kernel to access other areas
cut off by mem=. The case I'm dealing with which led to this patch
was a kdump dump-collection kernel. For that kernel, we really don't
want it accessing any general purpose memory outside of its system
ram. For the dump collection, it already uses ioremap_cache to get
to crash-kernel memory. The reason for the iomem resources added
for the UEFI ACPI regions was so the dump-collection kernel could
boot using ACPI.
That being said, I could rework the patch to add all "UEFI RAM" and
that would let us just check that one string for acpi_os_ioremap()
purposes rather than checking "System RAM" and "ACPI RAM".
On Fri, 2015-05-22 at 11:34 +0100, Catalin Marinas wrote:
> On Mon, May 18, 2015 at 06:49:28PM +0200, Ard Biesheuvel wrote:
> > On 18 May 2015 at 18:41, Catalin Marinas <[email protected]> wrote:
> > > On Mon, May 18, 2015 at 09:58:45AM -0400, Mark Salter wrote:
> > >> On Mon, 2015-05-18 at 12:11 +0100, Catalin Marinas wrote:
> > >> > On Thu, May 14, 2015 at 10:22:53AM -0400, Mark Salter wrote:
> > >> > > There is no guarantee that ACPI tables will be located in RAM linearly
> > >> > > mapped by the kernel. This could be because UEFI placed them below the
> > >> > > kernel image or because mem= places them beyond the reach of the linear
> > >> > > kernel mapping. Even though these tables are outside the linear mapped
> > >> > > RAM, they still need to be accessed as normal memory in order to support
> > >> > > unaligned accesses from ACPI code. In this case, the page_is_ram() test
> > >> > > in acpi_os_ioremap() is not sufficient.
> > >> >
> > >> > And can we not simply add the rest of the RAM to the resource list as
> > >> > "System RAM" without being part of memblock?
> > >>
> > >> If it is in "System RAM", then it needs a valid pfn and struct page.
> > >> Parts of the kernel expect that (page_is_ram(), memory hotplug, etc).
> > >
> > > OK, I had the impression that we could get away with this.
> > >
> > >> > > Additionally, if the table spans multiple pages, it may fall partially
> > >> > > within the linear map and partially without. If the table overlaps the
> > >> > > end of the linear map, the test for whether or not to use the existing
> > >> > > mapping in ioremap_cache() could lead to a panic when ACPI code tries
> > >> > > to access the part beyond the end of the linear map. This patch
> > >> > > attempts to address these problems.
> > >> >
> > >> > That's a problem with ioremap_cache() that should be fixed independently.
> > >>
> > >> I can submit that separately if you prefer.
> > >
> > > Yes, please.
> > >
> > >> > Ideally, I'd like to see the ACPI code use different APIs to distinguish
> > >> > between table access in RAM and device access, so that we don't have to
> > >> > guess whether the page is RAM or not.
> > >>
> > >> I don't think the ACPI code has enough info to make that decision, but
> > >> I'm not sure honestly.
> > >
> > > Do we have a guarantee that UEFI tells the kernel about the whole RAM?
> >
> > Yes, the UEFI memory map must describe all of RAM, no matter how it is
> > used. I may also describe some MMIO regions, but typically only
> > regions that it needs itself to implement the UEFI Runtime Services
> > (e.g., RTC base address, NOR flash for the variable store)
> >
> > So we could potentially query the UEFI memory map directly to find out
> > whether some otherwise unqualified region is backed by RAM or not,
> > although I'd prefer some intermediate data structure (such as the
> > physmem memblock table) if we go that route.
>
> OK, so my preferred options, in this order:
>
> 1. Change the core ACPI kernel code to distinguish between mapping I/O
> or RAM (could be as simple as acpi_map not using acpi_os_ioremap but
> another API). I guess the code knows when it plans to map tables or
> I/O registers
>From my reading of the code, ACPI distinguishes a number of different
address spaces and allows for the installation of handlers to access
those various spaces. The problem with ACPI_ADR_SPACE_SYSTEM_MEMORY is
that it covers both mmio and RAM and ACPI code makes no distinction
between the two. x86 code uses page_is_ram() to decide whether to use
kmap() or ioremap(). ia64 always uses ioremap() which sorts out the
caching internally. For arm64, we ended up with using page_is_ram() to
decide between ioremap_cache() an ioremap(). I don't think generic ACPI
code has the ability to do any better than that. It really is an
architecture decision based on address space attributes which ACPI
doesn't know about.
>
> 2. If the above is not possible, add the extra checks as per Mark's
> patch but I would rather call this resource "UEFI RAM" than "ACPI",
> it's not really ACPI specific.
>