Kukjin reported an oops that happens while changing min_free_kbytes:
http://www.spinics.net/lists/arm-kernel/msg92894.html
It is caused by the layout of the memory map on sparsemem.

The system has the following memory map:

     section 0              section 1              section 2
0x20000000-0x25000000, 0x40000000-0x50000000, 0x50000000-0x58000000
SECTION_SIZE_BITS 28 (256M)

This means section 0 is an incompletely filled section. Nonetheless,
the current pfn_valid() of sparsemem checks pfns loosely: it checks
only the mem_section's validity, but ARM can free the mem_map covering
a hole to save memory. So in the above case, a pfn in the hole at
0x25000000 passes pfn_valid()'s check, which is not what we want.
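
For illustration (not part of the patch), a standalone userspace C
sketch of the section arithmetic, assuming PAGE_SHIFT 12. It shows the
hole address landing in the same absolute section as real memory (the
"section 0/1/2" above are relative labels), which is why a
mem_section-granularity check cannot reject it:

#include <stdio.h>

#define PAGE_SHIFT		12
#define SECTION_SIZE_BITS	28
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)

static unsigned long pfn_to_section_nr(unsigned long pfn)
{
	return pfn >> PFN_SECTION_SHIFT;
}

int main(void)
{
	unsigned long base_pfn = 0x20000000UL >> PAGE_SHIFT;
	unsigned long hole_pfn = 0x25000000UL >> PAGE_SHIFT;

	/* Both print 2: the hole shares its section with real RAM. */
	printf("section of 0x20000000: %lu\n", pfn_to_section_nr(base_pfn));
	printf("section of 0x25000000: %lu\n", pfn_to_section_nr(hole_pfn));
	return 0;
}
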
We could match the section size to the smallest valid size (e.g. 16M
in the above case), but Russell doesn't like that because of
mem_section's memory overhead with other configurations (e.g. a 512K
section).
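
(Back-of-the-envelope, with assumed sizes rather than numbers from this
thread: with 512K sections, a 4GB physical address space needs
2^32 / 2^19 = 8192 mem_section entries versus 16 with 256M sections; at
two unsigned longs per entry on 32-bit, that is 64KB of static
bookkeeping instead of 128 bytes.)
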
I tried adding a valid pfn range to mem_section, but nobody liked that
either due to the size overhead. This patch was suggested by
KAMEZAWA-san; I just fixed a compile error and changed some naming.

This patch stores the address of the mem_section in the pg->private
field of the page structs that back the section's memmap itself. A
matching pg->private means the page is used as memmap for that
section; otherwise the page has been freed for another purpose and the
memmap has a hole there.

This patch is based on mmotm-2010-07-01-12-19.
Signed-off-by: Minchan Kim <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
Reported-by: Kukjin Kim <[email protected]>
---
 arch/arm/mm/init.c     |    9 ++++++++-
 include/linux/mmzone.h |   21 ++++++++++++++++++++-
 mm/Kconfig             |    5 +++++
 mm/sparse.c            |   41 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index cfe4c5e..4586f40 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -234,6 +234,11 @@ static void __init arm_bootmem_free(struct meminfo *mi)
 	arch_adjust_zones(zone_size, zhole_size);

 	free_area_init_node(0, zone_size, min, zhole_size);
+
+	for_each_bank(i, mi) {
+		mark_memmap_hole(bank_pfn_start(&mi->bank[i]),
+				bank_pfn_end(&mi->bank[i]), true);
+	}
 }

 #ifndef CONFIG_SPARSEMEM
@@ -386,8 +391,10 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
 	 * If there are free pages between these,
 	 * free the section of the memmap array.
 	 */
-	if (pg < pgend)
+	if (pg < pgend) {
+		mark_memmap_hole(pg >> PAGE_SHIFT, pgend >> PAGE_SHIFT, false);
 		free_bootmem(pg, pgend - pg);
+	}
 }

 /*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9ed9c45..2ed9728 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -15,6 +15,7 @@
 #include <linux/seqlock.h>
 #include <linux/nodemask.h>
 #include <linux/pageblock-flags.h>
+#include <linux/mm_types.h>
 #include <generated/bounds.h>
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -1047,11 +1048,29 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 	return __nr_to_section(pfn_to_section_nr(pfn));
 }

+void mark_memmap_hole(unsigned long start, unsigned long end, bool valid);
+
+#ifdef CONFIG_SPARSEMEM_HAS_HOLE
+static inline int page_valid(struct mem_section *ms, unsigned long pfn)
+{
+	struct page *page = pfn_to_page(pfn);
+	struct page *__pg = virt_to_page(page);
+	return __pg->private == (unsigned long)ms;
+}
+#else
+static inline int page_valid(struct mem_section *ms, unsigned long pfn)
+{
+	return 1;
+}
+#endif
+
 static inline int pfn_valid(unsigned long pfn)
 {
+	struct mem_section *ms;
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
-	return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
+	ms = __nr_to_section(pfn_to_section_nr(pfn));
+	return valid_section(ms) && page_valid(ms, pfn);
 }

 static inline int pfn_present(unsigned long pfn)
diff --git a/mm/Kconfig b/mm/Kconfig
index 527136b..959ac1d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,6 +128,11 @@ config SPARSEMEM_VMEMMAP
 	  pfn_to_page and page_to_pfn operations.  This is the most
 	  efficient option when sufficient kernel resources are available.

+config SPARSEMEM_HAS_HOLE
+	bool "allow holes in sparsemem's memmap"
+	depends on ARM && SPARSEMEM && !SPARSEMEM_VMEMMAP
+	default n
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219..76d5012 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -615,6 +615,47 @@ void __init sparse_init(void)
 	free_bootmem(__pa(usemap_map), size);
 }

+#ifdef CONFIG_SPARSEMEM_HAS_HOLE
+/*
+ * Fill memmap's pg->private with a pointer to mem_section.
+ * pfn_valid() will check this later. (see include/linux/mmzone.h)
+ * Every arch should call
+ *	mark_memmap_hole(start, end, true)  # for all allocated mem_map
+ * and, after that,
+ *	mark_memmap_hole(start, end, false) # for all holes in mem_map.
+ * Please see the usage in ARM.
+ */
+void mark_memmap_hole(unsigned long start, unsigned long end, bool valid)
+{
+	struct mem_section *ms;
+	unsigned long pos, next;
+	struct page *pg;
+	void *memmap, *mapend;
+
+	for (pos = start; pos < end; pos = next) {
+		next = (pos + PAGES_PER_SECTION) & PAGE_SECTION_MASK;
+		ms = __pfn_to_section(pos);
+		if (!valid_section(ms))
+			continue;
+
+		for (memmap = (void *)pfn_to_page(pos),
+			/* The last page in the section */
+			mapend = pfn_to_page(next - 1);
+			memmap < mapend; memmap += PAGE_SIZE) {
+			pg = virt_to_page(memmap);
+			if (valid)
+				pg->private = (unsigned long)ms;
+			else
+				pg->private = 0;
+		}
+	}
+}
+#else
+void mark_memmap_hole(unsigned long start, unsigned long end, bool valid)
+{
+}
+#endif
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
--
1.7.0.5
Hi Minchan,
Please see the OneDRAM spec; this is how the OneDRAM memory is used.
The actual memory size on the AP side is 80MiB + 16MiB: the 80MiB is
dedicated to the AP, and the 16MiB is shared between the AP and the CP,
so we use that 16MiB as well.
Thank you,
Kyungmin Park
On Sun, Jul 18, 2010 at 7:18 PM, Minchan Kim <[email protected]> wrote:
> Kukjin reported an oops that happens while changing min_free_kbytes:
> http://www.spinics.net/lists/arm-kernel/msg92894.html
> It is caused by the layout of the memory map on sparsemem.
> [...]
Hi Kyungmin,
On Sun, Jul 18, 2010 at 08:53:25PM +0900, Kyungmin Park wrote:
> Hi Minchan,
>
> Please see the OneDRAM spec; this is how the OneDRAM memory is used.
> The actual memory size on the AP side is 80MiB + 16MiB: the 80MiB is
> dedicated to the AP, and the 16MiB is shared between the AP and the
> CP, so we use that 16MiB as well.
This is not just about s5pv210; it is a general problem with memmap
holes in ARM's sparsemem. Whether it is 16M or 80M doesn't matter.
The point is that the section size is larger than the physical memory
banks. Current sparsemem isn't designed to have holes in the memmap,
but ARM punches holes to save memory, so we should solve this with
something more fundamental than SECTION_SIZE.
I think this patch is that solution.
--
Kind regards,
Minchan Kim
On Sun, 18 Jul 2010 19:18:31 +0900
Minchan Kim <[email protected]> wrote:
> Kukjin reported an oops that happens while changing min_free_kbytes:
> http://www.spinics.net/lists/arm-kernel/msg92894.html
> It is caused by the layout of the memory map on sparsemem.
>
> The system has the following memory map:
>
>      section 0              section 1              section 2
> 0x20000000-0x25000000, 0x40000000-0x50000000, 0x50000000-0x58000000
> SECTION_SIZE_BITS 28 (256M)
>
> This means section 0 is an incompletely filled section. Nonetheless,
> the current pfn_valid() of sparsemem checks pfns loosely: it checks
> only the mem_section's validity, but ARM can free the mem_map covering
> a hole to save memory. So in the above case, a pfn in the hole at
> 0x25000000 passes pfn_valid()'s check, which is not what we want.
>
> We could match the section size to the smallest valid size (e.g. 16M
> in the above case), but Russell doesn't like that because of
> mem_section's memory overhead with other configurations (e.g. a 512K
> section).
>
> I tried adding a valid pfn range to mem_section, but nobody liked that
> either due to the size overhead. This patch was suggested by
> KAMEZAWA-san; I just fixed a compile error and changed some naming.
>
> This patch stores the address of the mem_section in the pg->private
> field of the page structs that back the section's memmap itself. A
> matching pg->private means the page is used as memmap for that
> section; otherwise the page has been freed for another purpose and the
> memmap has a hole there.
>
> This patch is based on mmotm-2010-07-01-12-19.
>
> Signed-off-by: Minchan Kim <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> Reported-by: Kukjin Kim <[email protected]>
Thank you for working on this. I myself like this solution.
I think the ARM guys can make this the default later (after the rc
period?).
Thanks,
-Kame
Hi,
On Sun, Jul 18, 2010 at 07:18:31PM +0900, Minchan Kim wrote:
> Kukjin reported an oops that happens while changing min_free_kbytes:
> http://www.spinics.net/lists/arm-kernel/msg92894.html
> It is caused by the layout of the memory map on sparsemem.
>
> The system has the following memory map:
>
>      section 0              section 1              section 2
> 0x20000000-0x25000000, 0x40000000-0x50000000, 0x50000000-0x58000000
> SECTION_SIZE_BITS 28 (256M)
>
> This means section 0 is an incompletely filled section. Nonetheless,
> the current pfn_valid() of sparsemem checks pfns loosely: it checks
> only the mem_section's validity, but ARM can free the mem_map covering
> a hole to save memory. So in the above case, a pfn in the hole at
> 0x25000000 passes pfn_valid()'s check, which is not what we want.
>
> We could match the section size to the smallest valid size (e.g. 16M
> in the above case), but Russell doesn't like that because of
> mem_section's memory overhead with other configurations (e.g. a 512K
> section).
>
> I tried adding a valid pfn range to mem_section, but nobody liked that
> either due to the size overhead. This patch was suggested by
> KAMEZAWA-san; I just fixed a compile error and changed some naming.
I did not like it, because it messes up the whole concept of a
section.
But most importantly, we already have a crutch for ARM in place,
namely memmap_valid_within(). Looking at Kukjin's bug report,
wouldn't it be enough to use that check in
setup_zone_migrate_reserve()?
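
For reference, memmap_valid_within() is roughly the following (quoting
the mm/mmzone.c helper from memory, so treat the details as
approximate). A mem_map page that was freed and reused will not
round-trip back to the pfn/zone being checked:

int memmap_valid_within(unsigned long pfn,
				struct page *page, struct zone *zone)
{
	/* A reused memmap page won't map back to this pfn... */
	if (page_to_pfn(page) != pfn)
		return 0;

	/* ...nor claim to belong to the zone being walked. */
	if (page_zone(page) != zone)
		return 0;

	return 1;
}
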
Your approach makes every pfn_valid() more expensive, although the
extensive checks are not needed everywhere (see the comment above
memmap_valid_within()): vm_normal_page(), for example, can probably
assume that a PTE won't point to a hole within the memory map.
OTOH, if the ARM people do not care, we could probably go with your
approach, encode it all into pfn_valid(), and also get rid of
memmap_valid_within() completely. But I would prefer doing a bugfix
first and such a conceptual change in a different patch, would you
agree?
Kukjin, does the appended patch also fix your problem?
Hannes
---
From: Johannes Weiner <[email protected]>
Subject: mm: check mem_map backing in setup_zone_migrate_reserve
Kukjin encountered kernel oopsen when changing
/proc/sys/vm/min_free_kbytes. The problem is that his sparse memory
layout on ARM is the following:
     section 0              section 1              section 2
0x20000000-0x25000000, 0x40000000-0x50000000, 0x50000000-0x58000000
SECTION_SIZE_BITS 28 (256M)
where there is a memory hole at the end of section 0.
Since section 0 has _some_ memory, pfn_valid() will return true for
all PFNs in this section. But ARM releases the mem_map pages of this
hole and pfn_valid() alone is not enough anymore to ensure there is a
valid page struct behind a PFN.
We acknowledged that ARM does this already, and we have a function to
double-check for mem_map in cases where we do PFN range walks (as
opposed to coming from a page table entry, which should not point to a
memory hole in the first place).
setup_zone_migrate_reserve() contains one such range walk which does
not have the extra check and was also the cause of the oopsen Kukjin
encountered.
This patch adds the needed memmap_valid_within() check.
Reported-by: Kukjin Kim <[email protected]>
Signed-off-by: Johannes Weiner <[email protected]>
---
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0b0b629..cb6d6d3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3168,6 +3168,10 @@ static void setup_zone_migrate_reserve(struct zone *zone)
 			continue;
 		page = pfn_to_page(pfn);

+		/* Watch out for holes in the memory map */
+		if (!memmap_valid_within(pfn, page, zone))
+			continue;
+
 		/* Watch out for overlapping nodes */
 		if (page_to_nid(page) != zone_to_nid(zone))
 			continue;
Johannes Weiner wrote:
>
> Hi,
>
> On Sun, Jul 18, 2010 at 07:18:31PM +0900, Minchan Kim wrote:
> > Kukjin reported oops happen while he change min_free_kbytes
> > http://www.spinics.net/lists/arm-kernel/msg92894.html
> > It happen by memory map on sparsemem.
> >
> > The system has a memory map following as.
> > section 0 section 1 section 2
> > 0x20000000-0x25000000, 0x40000000-0x50000000, 0x50000000-0x58000000
> > SECTION_SIZE_BITS 28(256M)
> >
> > It means section 0 is an incompletely filled section.
> > Nontheless, current pfn_valid of sparsemem checks pfn loosely.
> > It checks only mem_section's validation but ARM can free mem_map on hole
> > to save memory space. So in above case, pfn on 0x25000000 can pass
> pfn_valid's
> > validation check. It's not what we want.
> >
> > We can match section size to smallest valid size.(ex, above case, 16M)
> > But Russell doesn't like it due to mem_section's memory overhead with
different
> > configuration(ex, 512K section).
> >
> > I tried to add valid pfn range in mem_section but everyone doesn't like
it
> > due to size overhead. This patch is suggested by KAMEZAWA-san.
> > I just fixed compile error and change some naming.
>
> I did not like it, because it messes up the whole concept of a
> section.
>
> But most importantly, we already have a crutch for ARM in place,
> namely memmap_valid_within(). Looking at Kukjin's bug report,
> wouldn't it be enough to use that check in
> setup_zone_migrate_reserve()?
>
> Your approach makes every pfn_valid() more expensive, although the
> extensive checks are not needed everywhere (see the comment above
> memmap_valid_within()): vm_normal_page(), for example, can probably
> assume that a PTE won't point to a hole within the memory map.
>
> OTOH, if the ARM people do not care, we could probably go with your
> approach, encode it all into pfn_valid(), and also get rid of
> memmap_valid_within() completely. But I would prefer doing a bugfix
> first and such a conceptual change in a different patch, would you
> agree?
>
> Kukjin, does the appended patch also fix your problem?
>
Yes, the problem did not happen with your patch.
But Minchan had already asked for a test on the board with the same
patch; you can find it in the following thread:
http://lists.infradead.org/pipermail/linux-arm-kernel/2010-July/020199.html
I'm not sure which approach is better for us right now.
Hmm...
Thanks.
Best regards,
Kgene.
--
Kukjin Kim <[email protected]>, Senior Engineer,
SW Solution Development Team, Samsung Electronics Co., Ltd.
On Sun, Jul 18, 2010 at 07:18:31PM +0900, Minchan Kim wrote:
> Kukjin reported an oops that happens while changing min_free_kbytes:
> http://www.spinics.net/lists/arm-kernel/msg92894.html
> It is caused by the layout of the memory map on sparsemem.
>
First off, thanks for working on this.
> The system has the following memory map:
>
>      section 0              section 1              section 2
> 0x20000000-0x25000000, 0x40000000-0x50000000, 0x50000000-0x58000000
> SECTION_SIZE_BITS 28 (256M)
>
> This means section 0 is an incompletely filled section. Nonetheless,
> the current pfn_valid() of sparsemem checks pfns loosely: it checks
> only the mem_section's validity, but ARM can free the mem_map covering
> a hole to save memory. So in the above case, a pfn in the hole at
> 0x25000000 passes pfn_valid()'s check, which is not what we want.
>
> We could match the section size to the smallest valid size (e.g. 16M
> in the above case), but Russell doesn't like that because of
> mem_section's memory overhead with other configurations (e.g. a 512K
> section).
>
> I tried adding a valid pfn range to mem_section, but nobody liked that
> either due to the size overhead.
Also, IIRC, it was vulnerable to a hole being punched in the middle of
the section.
> This patch was suggested by KAMEZAWA-san; I just fixed a compile
> error and changed some naming.
>
> This patch stores the address of the mem_section in the pg->private
> field of the page structs that back the section's memmap itself. A
> matching pg->private means the page is used as memmap for that
> section; otherwise the page has been freed for another purpose and the
> memmap has a hole there.
>
> This patch is based on mmotm-2010-07-01-12-19.
>
> Signed-off-by: Minchan Kim <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> Reported-by: Kukjin Kim <[email protected]>
> ---
> arch/arm/mm/init.c | 9 ++++++++-
> include/linux/mmzone.h | 21 ++++++++++++++++++++-
> mm/Kconfig | 5 +++++
> mm/sparse.c | 41 +++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 74 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
> index cfe4c5e..4586f40 100644
> --- a/arch/arm/mm/init.c
> +++ b/arch/arm/mm/init.c
> @@ -234,6 +234,11 @@ static void __init arm_bootmem_free(struct meminfo *mi)
> arch_adjust_zones(zone_size, zhole_size);
>
> free_area_init_node(0, zone_size, min, zhole_size);
> +
> + for_each_bank(i, mi) {
> + mark_memmap_hole(bank_pfn_start(&mi->bank[i]),
> + bank_pfn_end(&mi->bank[i]), true);
> + }
> }
Why do we need to mark banks both valid and invalid? Is it not enough to
just mark the holes in free_memmap() and assume it is valid otherwise?
>
> #ifndef CONFIG_SPARSEMEM
> @@ -386,8 +391,10 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
> * If there are free pages between these,
> * free the section of the memmap array.
> */
> - if (pg < pgend)
> + if (pg < pgend) {
> + mark_memmap_hole(pg >> PAGE_SHIFT, pgend >> PAGE_SHIFT, false);
> free_bootmem(pg, pgend - pg);
> + }
> }
>
> /*
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 9ed9c45..2ed9728 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -15,6 +15,7 @@
> #include <linux/seqlock.h>
> #include <linux/nodemask.h>
> #include <linux/pageblock-flags.h>
> +#include <linux/mm_types.h>
> #include <generated/bounds.h>
> #include <asm/atomic.h>
> #include <asm/page.h>
> @@ -1047,11 +1048,29 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
> return __nr_to_section(pfn_to_section_nr(pfn));
> }
>
> +void mark_memmap_hole(unsigned long start, unsigned long end, bool valid);
> +
The naming is confusing with the "valid" parameter.
What's a "valid hole"? I can see that one being a cause of head
scratching in the future :)
> +#ifdef CONFIG_SPARSEMEM_HAS_HOLE
Why not use CONFIG_ARCH_HAS_HOLES_MEMORYMODEL ?
> +static inline int page_valid(struct mem_section *ms, unsigned long pfn)
> +{
> + struct page *page = pfn_to_page(pfn);
> + struct page *__pg = virt_to_page(page);
> + return __pg->private == (unsigned long)ms;
> +}
> +#else
> +static inline int page_valid(struct mem_section *ms, unsigned long pfn)
> +{
> + return 1;
> +}
> +#endif
> +
> static inline int pfn_valid(unsigned long pfn)
> {
> + struct mem_section *ms;
> if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
> return 0;
> - return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
> + ms = __nr_to_section(pfn_to_section_nr(pfn));
> + return valid_section(ms) && page_valid(ms, pfn);
> }
So it appears here that we unconditionally check page_valid(), but we
know which sections had holes in them at the time we called
mark_memmap_hole(). Can the sections with holes be tagged so that only
some sections need to call page_valid()? As it is, ARM will be taking a
performance hit just in case the section has holes, but it should only
need to take that hit in the corner case where a section is not fully
populated.
>
> static inline int pfn_present(unsigned long pfn)
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 527136b..959ac1d 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -128,6 +128,11 @@ config SPARSEMEM_VMEMMAP
> pfn_to_page and page_to_pfn operations. This is the most
> efficient option when sufficient kernel resources are available.
>
> +config SPARSEMEM_HAS_HOLE
> + bool "allow holes in sparsemem's memmap"
> + depends on ARM && SPARSEMEM && !SPARSEMEM_VMEMMAP
> + default n
> +
> # eventually, we can have this option just 'select SPARSEMEM'
> config MEMORY_HOTPLUG
> bool "Allow for memory hot-add"
> diff --git a/mm/sparse.c b/mm/sparse.c
> index 95ac219..76d5012 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -615,6 +615,47 @@ void __init sparse_init(void)
> free_bootmem(__pa(usemap_map), size);
> }
>
> +#ifdef CONFIG_SPARSEMEM_HAS_HOLE
> +/*
> + * Fill memmap's pg->private with a pointer to mem_section.
> + * pfn_valid() will check this later. (see include/linux/mmzone.h)
> > + * Every arch should call
> + * mark_memmap_hole(start, end, true) # for all allocated mem_map
> + * and, after that,
> + * mark_memmap_hole(start, end, false) # for all holes in mem_map.
> > + * Please see the usage in ARM.
> + */
> +void mark_memmap_hole(unsigned long start, unsigned long end, bool valid)
> +{
> + struct mem_section *ms;
> + unsigned long pos, next;
> + struct page *pg;
> + void *memmap, *mapend;
> +
> + for (pos = start; pos < end; pos = next) {
> + next = (pos + PAGES_PER_SECTION) & PAGE_SECTION_MASK;
> + ms = __pfn_to_section(pos);
> + if (!valid_section(ms))
> + continue;
> +
> + for (memmap = (void*)pfn_to_page(pos),
> + /* The last page in section */
> + mapend = pfn_to_page(next-1);
> + memmap < mapend; memmap += PAGE_SIZE) {
> + pg = virt_to_page(memmap);
> + if (valid)
> + pg->private = (unsigned long)ms;
> + else
> + pg->private = 0;
> + }
> + }
> +}
> +#else
> +void mark_memmap_hole(unsigned long start, unsigned long end, bool valid)
> +{
> +}
> +#endif
> +
The patch should also delete memmap_valid_within() and replace it with
a call to pfn_valid_within(). memmap_valid_within() existed because
sparsemem had holes punched in it, but I'd rather not see use of that
function grow.
> #ifdef CONFIG_MEMORY_HOTPLUG
> #ifdef CONFIG_SPARSEMEM_VMEMMAP
> static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
> --
> 1.7.0.5
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Tue, Jul 20, 2010 at 12:15:58PM +0200, Johannes Weiner wrote:
> Hi,
>
> On Sun, Jul 18, 2010 at 07:18:31PM +0900, Minchan Kim wrote:
> > Kukjin reported an oops that happens while changing min_free_kbytes:
> > http://www.spinics.net/lists/arm-kernel/msg92894.html
> > It is caused by the layout of the memory map on sparsemem.
> >
> > The system has the following memory map:
> >
> >      section 0              section 1              section 2
> > 0x20000000-0x25000000, 0x40000000-0x50000000, 0x50000000-0x58000000
> > SECTION_SIZE_BITS 28 (256M)
> >
> > This means section 0 is an incompletely filled section. Nonetheless,
> > the current pfn_valid() of sparsemem checks pfns loosely: it checks
> > only the mem_section's validity, but ARM can free the mem_map
> > covering a hole to save memory. So in the above case, a pfn in the
> > hole at 0x25000000 passes pfn_valid()'s check, which is not what we
> > want.
> >
> > We could match the section size to the smallest valid size (e.g. 16M
> > in the above case), but Russell doesn't like that because of
> > mem_section's memory overhead with other configurations (e.g. a 512K
> > section).
> >
> > I tried adding a valid pfn range to mem_section, but nobody liked
> > that either due to the size overhead. This patch was suggested by
> > KAMEZAWA-san; I just fixed a compile error and changed some naming.
>
> I did not like it, because it messes up the whole concept of a
> section.
Yes, but ARM has already broken it, so we can't ignore that.
>
> But most importantly, we already have a crutch for ARM in place,
> namely memmap_valid_within(). Looking at Kukjin's bug report,
> wouldn't it be enough to use that check in
> setup_zone_migrate_reserve()?
I did that, but I don't think it's a fundamental solution: it would
create a new, bad rule that every pfn walker must call
memmap_valid_within().
I just grepped with "grep -nRH 'start_pfn' mm/": if we add the check to
setup_zone_migrate_reserve(), look at kmemleak (kmemleak_scan),
compaction (isolate_migratepages) and so on; I am not sure how many
there are, but I suspect they have the same problem as
setup_zone_migrate_reserve().
Should we add memmap_valid_within() to every pfn walker?
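
The pattern in question looks roughly like this everywhere (a sketch,
not from any particular file; zone, start_pfn and end_pfn stand for
whatever each walker iterates over), with the extra check each walker
has to remember:

	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		/* Easy to forget: needed wherever mem_map may have holes */
		if (!memmap_valid_within(pfn, page, zone))
			continue;
		/* ... operate on page ... */
	}
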
>
> Your approach makes every pfn_valid() more expensive, although the
> extensive checks are not needed everywhere (see the comment above
> memmap_valid_within()): vm_normal_page(), for example, can probably
> assume that a PTE won't point to a hole within the memory map.
I agree, but I think that's the trade-off for an architecture that has
memmap holes. They want a model that doesn't fit sparsemem's design, so
it's a cost they have to pay. None of us mm folks want such a model,
but ARM has been using it, so we can't ignore them. I want to
accommodate ARM without affecting architectures whose memmap has no
holes.
From that point of view, this patch adds no overhead on hole-free
architectures, and it doesn't create the new rule I described above.
Also, developers of architectures with holes no longer need to override
pfn_valid() to detect pfns that fall in a hole.
>
> OTOH, if the ARM people do not care, we could probably go with your
> approach, encode it all into pfn_valid(), and also get rid of
> memmap_valid_within() completely. But I would prefer doing a bugfix
> first and such a conceptual change in a different patch, would you
> agree?
Hmm, I am not sure. Does the memmap_valid_within() problem only happen
with sparsemem? AFAIR, it can also happen with a hole punched in the
middle (not at either end) of FLATMEM. If that's right, maybe we should
expand this patch to CONFIG_ARCH_HAS_HOLES_MEMORYMODEL rather than
CONFIG_SPARSEMEM.
Note that this patch only validates the memmap, not the pages
themselves: if one memmap page contains a mix of valid and invalid
struct page descriptors, it can't tell them apart, so pfn walkers still
need to check PageReserved in addition to pfn_valid(). But as I
reviewed the examples above, some cases don't check it; that is another
story we have to fix.
Thanks for careful review, Hannes. :)
--
Kind regards,
Minchan Kim
On Tue, Jul 20, 2010 at 04:28:09PM +0100, Mel Gorman wrote:
> On Sun, Jul 18, 2010 at 07:18:31PM +0900, Minchan Kim wrote:
> > Kukjin reported an oops that happens while changing min_free_kbytes:
> > http://www.spinics.net/lists/arm-kernel/msg92894.html
> > It is caused by the layout of the memory map on sparsemem.
> >
>
> First off, thanks for working on this.
>
> > The system has the following memory map:
> >
> >      section 0              section 1              section 2
> > 0x20000000-0x25000000, 0x40000000-0x50000000, 0x50000000-0x58000000
> > SECTION_SIZE_BITS 28 (256M)
> >
> > This means section 0 is an incompletely filled section. Nonetheless,
> > the current pfn_valid() of sparsemem checks pfns loosely: it checks
> > only the mem_section's validity, but ARM can free the mem_map
> > covering a hole to save memory. So in the above case, a pfn in the
> > hole at 0x25000000 passes pfn_valid()'s check, which is not what we
> > want.
> >
> > We could match the section size to the smallest valid size (e.g. 16M
> > in the above case), but Russell doesn't like that because of
> > mem_section's memory overhead with other configurations (e.g. a 512K
> > section).
> >
> > I tried adding a valid pfn range to mem_section, but nobody liked
> > that either due to the size overhead.
>
> Also, IIRC, it was vulnerable to a hole being punched in the middle of
> the section.
>
> > This patch was suggested by KAMEZAWA-san; I just fixed a compile
> > error and changed some naming.
> >
> > This patch stores the address of the mem_section in the pg->private
> > field of the page structs that back the section's memmap itself. A
> > matching pg->private means the page is used as memmap for that
> > section; otherwise the page has been freed for another purpose and
> > the memmap has a hole there.
> >
> > This patch is based on mmotm-2010-07-01-12-19.
> >
> > Signed-off-by: Minchan Kim <[email protected]>
> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > Reported-by: Kukjin Kim <[email protected]>
> > ---
> > arch/arm/mm/init.c | 9 ++++++++-
> > include/linux/mmzone.h | 21 ++++++++++++++++++++-
> > mm/Kconfig | 5 +++++
> > mm/sparse.c | 41 +++++++++++++++++++++++++++++++++++++++++
> > 4 files changed, 74 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
> > index cfe4c5e..4586f40 100644
> > --- a/arch/arm/mm/init.c
> > +++ b/arch/arm/mm/init.c
> > @@ -234,6 +234,11 @@ static void __init arm_bootmem_free(struct meminfo *mi)
> > arch_adjust_zones(zone_size, zhole_size);
> >
> > free_area_init_node(0, zone_size, min, zhole_size);
> > +
> > + for_each_bank(i, mi) {
> > + mark_memmap_hole(bank_pfn_start(&mi->bank[i]),
> > + bank_pfn_end(&mi->bank[i]), true);
> > + }
> > }
>
> Why do we need to mark banks both valid and invalid? Is it not enough to
> just mark the holes in free_memmap() and assume it is valid otherwise?
>
Good point.
If we can make sure pg->private starts out zero, we can do it that way.
I will check.
> >
> > #ifndef CONFIG_SPARSEMEM
> > @@ -386,8 +391,10 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
> > * If there are free pages between these,
> > * free the section of the memmap array.
> > */
> > - if (pg < pgend)
> > + if (pg < pgend) {
> > + mark_memmap_hole(pg >> PAGE_SHIFT, pgend >> PAGE_SHIFT, false);
> > free_bootmem(pg, pgend - pg);
> > + }
> > }
> >
> > /*
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 9ed9c45..2ed9728 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -15,6 +15,7 @@
> > #include <linux/seqlock.h>
> > #include <linux/nodemask.h>
> > #include <linux/pageblock-flags.h>
> > +#include <linux/mm_types.h>
> > #include <generated/bounds.h>
> > #include <asm/atomic.h>
> > #include <asm/page.h>
> > @@ -1047,11 +1048,29 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
> > return __nr_to_section(pfn_to_section_nr(pfn));
> > }
> >
> > +void mark_memmap_hole(unsigned long start, unsigned long end, bool valid);
> > +
>
> The naming is confusing with the "valid" parameter.
>
> What's a "valid hole"? I can see that one being a cause of head
> scratching in the future :)
Okay. If we only call it from free_memmap(), we can rename it as
follows:
ex) mark_invalid_memmap(start, end);
Will change.
>
> > +#ifdef CONFIG_SPARSEMEM_HAS_HOLE
>
> Why not use CONFIG_ARCH_HAS_HOLES_MEMORYMODEL ?
As I mentioned in my previous mail (the reply to Hannes), if the
problem can also happen with FLATMEM, then
CONFIG_ARCH_HAS_HOLES_MEMORYMODEL is the right choice.
Please confirm this, Mel. :)
>
> > +static inline int page_valid(struct mem_section *ms, unsigned long pfn)
> > +{
> > + struct page *page = pfn_to_page(pfn);
> > + struct page *__pg = virt_to_page(page);
> > + return __pg->private == (unsigned long)ms;
> > +}
> > +#else
> > +static inline int page_valid(struct mem_section *ms, unsigned long pfn)
> > +{
> > + return 1;
> > +}
> > +#endif
> > +
> > static inline int pfn_valid(unsigned long pfn)
> > {
> > + struct mem_section *ms;
> > if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
> > return 0;
> > - return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
> > + ms = __nr_to_section(pfn_to_section_nr(pfn));
> > + return valid_section(ms) && page_valid(ms, pfn);
> > }
>
> So it appears here that we unconditionally check page_valid(), but we
> know which sections had holes in them at the time we called
> mark_memmap_hole(). Can the sections with holes be tagged so that only
> some sections need to call page_valid()? As it is, ARM will be taking
> a performance hit just in case the section has holes, but it should
> only need to take that hit in the corner case where a section is not
> fully populated.
In fact, I tried that with SECTION_MAP_LAST_BIT as you suggested, but
got stuck: section_mem_map only has room for 2 flag bits, and we
already use both with different meanings.
A dumb question: is there any case where a section has
SECTION_MARKED_PRESENT but not SECTION_HAS_MEM_MAP, other than while
the section is being populated? I mean, can't we remove
SECTION_MARKED_PRESENT entirely? If so, we could use that bit to mark
sections with holes.
If not, I think we could use a low bit of pageblock_flags, although
that's not pretty.
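
What I mean is roughly the following (illustrative only:
SECTION_HAS_HOLE is a made-up name, and widening SECTION_MAP_LAST_BIT
would also force stricter alignment on the encoded mem_map pointer,
since the flags live in its low bits):

/* today: SECTION_MARKED_PRESENT = 1UL<<0, SECTION_HAS_MEM_MAP = 1UL<<1 */
#define SECTION_HAS_HOLE	(1UL<<2)	/* hypothetical third bit */

static inline int section_has_hole(struct mem_section *ms)
{
	return ms && (ms->section_mem_map & SECTION_HAS_HOLE);
}

static inline int pfn_valid(unsigned long pfn)
{
	struct mem_section *ms;

	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
		return 0;
	ms = __nr_to_section(pfn_to_section_nr(pfn));
	if (!valid_section(ms))
		return 0;
	/* Only partially populated sections pay for the memmap probe */
	return !section_has_hole(ms) || page_valid(ms, pfn);
}
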
Thanks for careful review, Mel.
> [...]
--
Kind regards,
Minchan Kim