For one zone, there are three counters that describe its space:
spanned_pages
present_pages
managed_pages
The detailed meaning is documented in include/linux/mmzone.h. This patch
concerns the last two.
present_pages is physical pages existing within the zone
managed_pages is present pages managed by the buddy system
From the definitions, managed_pages is a stricter condition than
present_pages.
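Roughly, per the comment in include/linux/mmzone.h:

    spanned_pages = zone_end_pfn - zone_start_pfn
    present_pages = spanned_pages - absent_pages (pages in holes)
    managed_pages = present_pages - reserved_pages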
There are two helpers that use a zone's present_pages as the check:
populated_zone()
for_each_populated_zone()
Going through the kernel tree shows that most of their users actually
want to access pages managed by the buddy system, which means checking a
zone's managed_pages is the more exact validation.
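For reference, both helpers already live in include/linux/mmzone.h and
look roughly like this:

    /* Returns true if a zone has pages managed by the buddy allocator */
    static inline bool managed_zone(struct zone *zone)
    {
            return zone->managed_pages;
    }

    /* Returns true if a zone has memory */
    static inline bool populated_zone(struct zone *zone)
    {
            return zone->present_pages;
    }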
This patch replaces the checks on present_pages with checks on
managed_pages by:
* change for_each_populated_zone() to for_each_managed_zone()
* convert for_each_populated_zone() to for_each_zone() and check
  populated_zone() where necessary
* change populated_zone() to managed_zone() in the proper places
Signed-off-by: Wei Yang <[email protected]>
---
Michal, after the last mail, I did one more thing and replaced
populated_zone() with managed_zone() in the proper places.
One thing I am not sure about is those places in mm/compaction.c. I have
changed them; if that is not right, please let me know.
BTW, I did a boot-up test with the patched kernel and it looks smooth.
---
arch/s390/mm/page-states.c |  2 +-
include/linux/mmzone.h     |  8 +++-----
kernel/power/snapshot.c    | 31 ++++++++++++++++++++-----------
mm/compaction.c            |  8 ++++----
mm/highmem.c               |  5 ++---
mm/huge_memory.c           |  2 +-
mm/khugepaged.c            |  2 +-
mm/madvise.c               |  2 +-
mm/migrate.c               |  2 +-
mm/page-writeback.c        |  4 ++--
mm/page_alloc.c            | 19 +++++++++----------
mm/vmstat.c                | 14 +++++++-------
12 files changed, 52 insertions(+), 47 deletions(-)
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index dc3cede7f2ec..015430bf0c63 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -265,7 +265,7 @@ void arch_set_page_states(int make_stable)
return;
if (make_stable)
drain_local_pages(NULL);
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
spin_lock_irqsave(&zone->lock, flags);
for_each_migratetype_order(order, t) {
list_for_each(l, &zone->free_area[order].free_list[t]) {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 847705a6d0ec..2174baba0546 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -937,11 +937,9 @@ extern struct zone *next_zone(struct zone *zone);
zone; \
zone = next_zone(zone))
-#define for_each_populated_zone(zone) \
- for (zone = (first_online_pgdat())->node_zones; \
- zone; \
- zone = next_zone(zone)) \
- if (!populated_zone(zone)) \
+#define for_each_managed_zone(zone) \
+ for_each_zone(zone) \
+ if (!managed_zone(zone)) \
; /* do nothing */ \
else
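The trailing "; /* do nothing */ else" shape is kept from the old macro
on purpose: a plain "if (managed_zone(zone))" would let an else from the
surrounding code bind to the macro's if, while this form keeps the
construct safe as a single statement. Usage is unchanged; a minimal
sketch:

    struct zone *zone;
    unsigned long free = 0;

    /* the body only runs for zones with pages in the buddy allocator */
    for_each_managed_zone(zone)
            free += zone_page_state(zone, NR_FREE_PAGES);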
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b0308a2c6000..aa99efa73d89 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -592,10 +592,13 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
INIT_LIST_HEAD(list);
- for_each_populated_zone(zone) {
+ for_each_zone(zone) {
unsigned long zone_start, zone_end;
struct mem_extent *ext, *cur, *aux;
+ if (!populated_zone(zone))
+ continue;
+
zone_start = zone->zone_start_pfn;
zone_end = zone_end_pfn(zone);
@@ -1193,8 +1196,8 @@ static unsigned int count_free_highmem_pages(void)
struct zone *zone;
unsigned int cnt = 0;
- for_each_populated_zone(zone)
- if (is_highmem(zone))
+ for_each_zone(zone)
+ if (populated_zone(zone) && is_highmem(zone))
cnt += zone_page_state(zone, NR_FREE_PAGES);
return cnt;
@@ -1239,10 +1242,10 @@ static unsigned int count_highmem_pages(void)
struct zone *zone;
unsigned int n = 0;
- for_each_populated_zone(zone) {
+ for_each_zone(zone) {
unsigned long pfn, max_zone_pfn;
- if (!is_highmem(zone))
+ if (!populated_zone(zone) || !is_highmem(zone))
continue;
mark_free_pages(zone);
@@ -1305,8 +1308,8 @@ static unsigned int count_data_pages(void)
unsigned long pfn, max_zone_pfn;
unsigned int n = 0;
- for_each_populated_zone(zone) {
- if (is_highmem(zone))
+ for_each_zone(zone) {
+ if (!populated_zone(zone) || is_highmem(zone))
continue;
mark_free_pages(zone);
@@ -1399,9 +1402,12 @@ static void copy_data_pages(struct memory_bitmap *copy_bm,
struct zone *zone;
unsigned long pfn;
- for_each_populated_zone(zone) {
+ for_each_zone(zone) {
unsigned long max_zone_pfn;
+ if (!populated_zone(zone))
+ continue;
+
mark_free_pages(zone);
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
@@ -1717,7 +1723,10 @@ int hibernate_preallocate_memory(void)
saveable += save_highmem;
highmem = save_highmem;
size = 0;
- for_each_populated_zone(zone) {
+ for_each_zone(zone) {
+ if (!populated_zone(zone))
+ continue;
+
size += snapshot_additional_pages(zone);
if (is_highmem(zone))
highmem += zone_page_state(zone, NR_FREE_PAGES);
@@ -1863,8 +1872,8 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
struct zone *zone;
unsigned int free = alloc_normal;
- for_each_populated_zone(zone)
- if (!is_highmem(zone))
+ for_each_zone(zone)
+ if (populated_zone(zone) && !is_highmem(zone))
free += zone_page_state(zone, NR_FREE_PAGES);
nr_pages += count_pages_for_highmem(nr_highmem);
diff --git a/mm/compaction.c b/mm/compaction.c
index 7c607479de4a..8867c011dd45 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -276,7 +276,7 @@ void reset_isolation_suitable(pg_data_t *pgdat)
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
struct zone *zone = &pgdat->node_zones[zoneid];
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
/* Only flush if a full compaction finished recently */
@@ -1832,7 +1832,7 @@ static void compact_node(int nid)
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
zone = &pgdat->node_zones[zoneid];
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
cc.nr_freepages = 0;
@@ -1927,7 +1927,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
zone = &pgdat->node_zones[zoneid];
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
@@ -1963,7 +1963,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
int status;
zone = &pgdat->node_zones[zoneid];
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
if (compaction_deferred(zone, cc.order))
diff --git a/mm/highmem.c b/mm/highmem.c
index 59db3223a5d6..1edc0539d25a 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -111,15 +111,14 @@ EXPORT_SYMBOL(totalhigh_pages);
EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
-unsigned int nr_free_highpages (void)
+unsigned int nr_free_highpages(void)
{
struct zone *zone;
unsigned int pages = 0;
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone)
if (is_highmem(zone))
pages += zone_page_state(zone, NR_FREE_PAGES);
- }
return pages;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4e4ef8fa479d..d0f97b29c96f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2829,7 +2829,7 @@ static int split_huge_pages_set(void *data, u64 val)
if (val != 1)
return -EINVAL;
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
if (!pfn_valid(pfn))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c13625c1ad5e..4c3ec240d4d9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1845,7 +1845,7 @@ static void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
/*
* We don't need to worry about fragmentation of
* ZONE_MOVABLE since it only has movable pages.
diff --git a/mm/madvise.c b/mm/madvise.c
index 6cb1ca93e290..3a8eced61107 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -677,7 +677,7 @@ static int madvise_inject_error(int behavior,
}
/* Ensure that all poisoned pages are removed from per-cpu lists */
- for_each_populated_zone(zone)
+ for_each_managed_zone(zone)
drain_all_pages(zone);
return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index f7e4bfdc13b7..5c4846dd0ae3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1819,7 +1819,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z;
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
/* Avoid waking kswapd by allocating pages_to_migrate pages. */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f690bae6b78..076f51e86149 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -281,7 +281,7 @@ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
for (z = 0; z < MAX_NR_ZONES; z++) {
struct zone *zone = pgdat->node_zones + z;
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
nr_pages += zone_page_state(zone, NR_FREE_PAGES);
@@ -316,7 +316,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
continue;
z = &NODE_DATA(node)->node_zones[i];
- if (!populated_zone(z))
+ if (!managed_zone(z))
continue;
nr_pages = zone_page_state(z, NR_FREE_PAGES);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c20d3c76bd59..458bd81cf75c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1747,7 +1747,7 @@ void __init page_alloc_init_late(void)
memblock_discard();
#endif
- for_each_populated_zone(zone)
+ for_each_managed_zone(zone)
set_zone_contiguous(zone);
}
@@ -2569,9 +2569,8 @@ static void drain_pages(unsigned int cpu)
{
struct zone *zone;
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone)
drain_pages_zone(cpu, zone);
- }
}
/*
@@ -2655,7 +2654,7 @@ void drain_all_pages(struct zone *zone)
if (pcp->pcp.count)
has_pcps = true;
} else {
- for_each_populated_zone(z) {
+ for_each_managed_zone(z) {
pcp = per_cpu_ptr(z->pageset, cpu);
if (pcp->pcp.count) {
has_pcps = true;
@@ -4857,7 +4856,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
struct zone *zone;
pg_data_t *pgdat;
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
@@ -4940,7 +4939,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
"yes" : "no");
}
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
int i;
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
@@ -4999,7 +4998,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
printk(KERN_CONT "\n");
}
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
unsigned int order;
unsigned long nr[MAX_ORDER], flags, total = 0;
unsigned char types[MAX_ORDER];
@@ -5787,7 +5786,7 @@ void __init setup_per_cpu_pageset(void)
struct pglist_data *pgdat;
struct zone *zone;
- for_each_populated_zone(zone)
+ for_each_managed_zone(zone)
setup_zone_pageset(zone);
for_each_online_pgdat(pgdat)
@@ -6905,7 +6904,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
- if (populated_zone(zone)) {
+ if (managed_zone(zone)) {
if (IS_ENABLED(CONFIG_HIGHMEM))
node_set_state(nid, N_HIGH_MEMORY);
if (zone_type <= ZONE_NORMAL)
@@ -7581,7 +7580,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
goto out;
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
unsigned int cpu;
for_each_possible_cpu(cpu)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6038ce593ce3..06cd9e9ecba1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -53,7 +53,7 @@ static void zero_zones_numa_counters(void)
{
struct zone *zone;
- for_each_populated_zone(zone)
+ for_each_managed_zone(zone)
zero_zone_numa_counters(zone);
}
@@ -256,7 +256,7 @@ void refresh_zone_stat_thresholds(void)
}
}
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long max_drift, tolerate_drift;
@@ -753,7 +753,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
int changes = 0;
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
struct per_cpu_pageset __percpu *p = zone->pageset;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
@@ -854,7 +854,7 @@ void cpu_vm_stats_fold(int cpu)
#endif
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
struct per_cpu_pageset *p;
p = per_cpu_ptr(zone->pageset, cpu);
@@ -1578,8 +1578,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
seq_putc(m, ')');
- /* If unpopulated, no other information is useful */
- if (!populated_zone(zone)) {
+ /* If unmanaged, no other information is useful */
+ if (!managed_zone(zone)) {
seq_putc(m, '\n');
return;
}
@@ -1817,7 +1817,7 @@ static bool need_update(int cpu)
{
struct zone *zone;
- for_each_populated_zone(zone) {
+ for_each_managed_zone(zone) {
struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
--
2.15.1
On Thu, 15 Nov 2018 07:50:40 +0800 Wei Yang <[email protected]> wrote:
> [...]
>
> BTW, I did a boot-up test with the patched kernel and it looks smooth.
Seems sensible, but a bit scary. A basic boot test is unlikely to
expose subtle gremlins.
Worse, the situations in which managed_zone() != populated_zone() are
rare(?), so it will take a long time for problems to be discovered, I
expect.
I'll toss it in there for now, let's see who breaks :(
On Thu, Nov 15, 2018 at 01:37:35PM -0800, Andrew Morton wrote:
>On Thu, 15 Nov 2018 07:50:40 +0800 Wei Yang <[email protected]> wrote:
>
>> [...]
>
>Seems sensible, but a bit scary. A basic boot test is unlikely to
>expose subtle gremlins.
>
Agree.
>Worse, the situations in which managed_zone() != populated_zone() are
>rare(?), so it will take a long time for problems to be discovered, I
>expect.
Hmm... I created a virtual machine with 4 nodes, which has 6 populated
zones in total. In all of them, present_pages and managed_pages differ.
This is a little bit beyond my expectation.
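The per-zone counters are visible in /proc/zoneinfo, which prints both,
e.g.:

    $ grep -E 'Node|present|managed' /proc/zoneinfo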
>
>I'll toss it in there for now, let's see who breaks :(
Thanks.
--
Wei Yang
Help you, Help me
On Thu 15-11-18 13:37:35, Andrew Morton wrote:
[...]
> Worse, the situations in which managed_zone() != populated_zone() are
> rare(?), so it will take a long time for problems to be discovered, I
> expect.
We would basically have to deplete the whole zone by the bootmem
allocator or pull out all pages from the page allocator. E.g. memory
hotplug decreases both managed and present counters. I am actually not
sure that is 100% correct (put on my TODO list to check). There is no
consistency in that regard.
That being said, I will review the patch (hopefully today), but
fundamentally most users should indeed care about managed pages when
iterating over zones with memory. There should be a good reason for a
user to want to look at reserved pages.
--
Michal Hocko
SUSE Labs
On Thu 15-11-18 07:50:40, Wei Yang wrote:
[...]
> @@ -1193,8 +1196,8 @@ static unsigned int count_free_highmem_pages(void)
> struct zone *zone;
> unsigned int cnt = 0;
>
> - for_each_populated_zone(zone)
> - if (is_highmem(zone))
> + for_each_zone(zone)
> + if (populated_zone(zone) && is_highmem(zone))
> cnt += zone_page_state(zone, NR_FREE_PAGES);
this should be for_each_managed_zone because we only care about highmem
zones which have pages in the allocator (NR_FREE_PAGES).
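I.e. roughly (a sketch):

    for_each_managed_zone(zone)
            if (is_highmem(zone))
                    cnt += zone_page_state(zone, NR_FREE_PAGES);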
>
> return cnt;
> @@ -1239,10 +1242,10 @@ static unsigned int count_highmem_pages(void)
> struct zone *zone;
> unsigned int n = 0;
>
> - for_each_populated_zone(zone) {
> + for_each_zone(zone) {
> unsigned long pfn, max_zone_pfn;
>
> - if (!is_highmem(zone))
> + if (!populated_zone(zone) || !is_highmem(zone))
> continue;
>
> mark_free_pages(zone);
I am not very familiar with this code, but I strongly suspect that we do
want for_each_managed_zone here, because saveable_highmem_page() skips
over all reserved pages, which rules out the bootmem pages. But this
should be double-checked with Rafael (Cc-ed).
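For reference, the relevant check looks roughly like this:

    /* kernel/power/snapshot.c */
    static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
    {
            ...
            if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
                PageReserved(page))
                    return NULL;
            ...
    }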
Rafael, does this loop care about pages which are not managed by the
page allocator?
> @@ -1305,8 +1308,8 @@ static unsigned int count_data_pages(void)
> unsigned long pfn, max_zone_pfn;
> unsigned int n = 0;
>
> - for_each_populated_zone(zone) {
> - if (is_highmem(zone))
> + for_each_zone(zone) {
> + if (!populated_zone(zone) || is_highmem(zone))
> continue;
>
> mark_free_pages(zone);
> @@ -1399,9 +1402,12 @@ static void copy_data_pages(struct memory_bitmap *copy_bm,
> struct zone *zone;
> unsigned long pfn;
>
> - for_each_populated_zone(zone) {
> + for_each_zone(zone) {
> unsigned long max_zone_pfn;
>
> + if (!populated_zone(zone))
> + continue;
> +
> mark_free_pages(zone);
> max_zone_pfn = zone_end_pfn(zone);
> for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
> @@ -1717,7 +1723,10 @@ int hibernate_preallocate_memory(void)
> saveable += save_highmem;
> highmem = save_highmem;
> size = 0;
> - for_each_populated_zone(zone) {
> + for_each_zone(zone) {
> + if (!populated_zone(zone))
> + continue;
> +
> size += snapshot_additional_pages(zone);
> if (is_highmem(zone))
> highmem += zone_page_state(zone, NR_FREE_PAGES);
ditto for the above.
> @@ -1863,8 +1872,8 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
> struct zone *zone;
> unsigned int free = alloc_normal;
>
> - for_each_populated_zone(zone)
> - if (!is_highmem(zone))
> + for_each_zone(zone)
> + if (populated_zone(zone) && !is_highmem(zone))
> free += zone_page_state(zone, NR_FREE_PAGES);
>
> nr_pages += count_pages_for_highmem(nr_highmem);
This one should be for_each_managed_zone (NR_FREE_PAGES)
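I.e. roughly (a sketch):

    for_each_managed_zone(zone)
            if (!is_highmem(zone))
                    free += zone_page_state(zone, NR_FREE_PAGES);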
The rest looks good to me.
--
Michal Hocko
SUSE Labs
On Fri, 2018-11-16 at 10:57 +0100, Michal Hocko wrote:
> On Thu 15-11-18 13:37:35, Andrew Morton wrote:
> [...]
> > Worse, the situations in which managed_zone() != populated_zone() are
> > rare(?), so it will take a long time for problems to be discovered, I
> > expect.
>
> We would basically have to deplete the whole zone by the bootmem
> allocator or pull out all pages from the page allocator. E.g. memory
> hotplug decreases both managed and present counters. I am actually not
> sure that is 100% correct (put on my TODO list to check). There is no
> consistency in that regard.
We can only offline non-reserved pages (so, managed pages).
Since present_pages holds reserved_pages + managed_pages, decreasing
both should be fine unless I am mistaken.
Oscar Salvador
On Fri 16-11-18 12:05:04, osalvador wrote:
> On Fri, 2018-11-16 at 10:57 +0100, Michal Hocko wrote:
[...]
> > E.g. memory hotplug decreases both managed and present counters. I
> > am actually not sure that is 100% correct (put on my TODO list to
> > check). There is no consistency in that regard.
>
> We can only offline non-reserved pages (so, managed pages).
Yes
> Since present_pages holds reserved_pages + managed_pages, decreasing
> both should be fine unless I am mistaken.
Well, present_pages is defined as "physical pages existing within the zone",
and those pages still exist, but they are offline. But as I've said, I
have to think about it some more.
--
Michal Hocko
SUSE Labs
On Fri, Nov 16, 2018 at 12:26:03PM +0100, Michal Hocko wrote:
>[...]
>
>Well, present_pages is defined as "physical pages existing within the zone",
>and those pages still exist, but they are offline. But as I've said, I
>have to think about it some more.
I may not have caught up with your discussion, but I'd like to share
what I learnt.
online_pages()
    online_pages_range()
    zone->present_pages += onlined_pages;

__offline_pages()
    adjust_managed_page_count()
    zone->present_pages -= offlined_pages;
The two counters, present_pages and managed_pages, are both adjusted
during online/offline.
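For reference, adjust_managed_page_count() in mm/page_alloc.c looks
roughly like this (a sketch, modulo details):

    void adjust_managed_page_count(struct page *page, long count)
    {
            spin_lock(&managed_page_count_lock);
            /* only managed_pages is touched here ... */
            page_zone(page)->managed_pages += count;
            totalram_pages += count;
    #ifdef CONFIG_HIGHMEM
            if (PageHighMem(page))
                    totalhigh_pages += count;
    #endif
            spin_unlock(&managed_page_count_lock);
            /* ... present_pages is adjusted by the callers themselves */
    }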
What I am not sure about is when *reserved_pages* would be adjusted.
Will we add this hot-added memory into memblock.reserved and allocate
memory via memblock_alloc() after system bootup?
--
Wei Yang
Help you, Help me
On Fri 16-11-18 15:58:28, Wei Yang wrote:
> On Fri, Nov 16, 2018 at 12:26:03PM +0100, Michal Hocko wrote:
> [...]
>
> What I am not sure about is when *reserved_pages* would be adjusted.
> Will we add this hot-added memory into memblock.reserved and allocate
> memory via memblock_alloc() after system bootup?
This is not really related to this patch. I have only mentioned the
memory hotplug as an example. I would rather focus on the change itself
so let's not get too off topic here.
--
Michal Hocko
SUSE Labs
On Fri, Nov 16, 2018 at 11:24:05AM +0100, Michal Hocko wrote:
>On Thu 15-11-18 07:50:40, Wei Yang wrote:
>[...]
>
>I am not very familiar with this code, but I strongly suspect that we do
>want for_each_managed_zone here, because saveable_highmem_page() skips
>over all reserved pages, which rules out the bootmem pages. But this
>should be double-checked with Rafael (Cc-ed).
>
>Rafael, does this loop care about pages which are not managed by the
>page allocator?
>
Hi, Rafael
Your opinion on this change and the following one is appreciated :-)
>[...]
--
Wei Yang
Help you, Help me