When we steal a whole pageblock, we don't need to break the highest-order
freepage: there may already be a small-order freepage in the block that we
can use directly. This also gives some code size reduction, because the
expand() call in __rmqueue_fallback(), which was inlined there, is removed.
text data bss dec hex filename
37413 1440 624 39477 9a35 mm/page_alloc.o
37249 1440 624 39313 9991 mm/page_alloc.o
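As a rough, self-contained illustration of the leftover accounting (plain
user-space C, not kernel code; the pageblock contents are taken from the
4 * order 0 + 1 * order 3 example discussed later in this thread):

#include <stdio.h>

#define MAX_ORDER 11

/* free page counts per order in one pageblock: 4 x order-0, 1 x order-3 */
static const int initial[MAX_ORDER] = { 4, 0, 0, 1 };

static void print_left(const char *tag, const int *left)
{
	int o;

	printf("%s:", tag);
	for (o = 0; o < MAX_ORDER; o++)
		if (left[o])
			printf(" %d x order-%d", left[o], o);
	printf("\n");
}

int main(void)
{
	int old[MAX_ORDER], new[MAX_ORDER];
	int o;

	for (o = 0; o < MAX_ORDER; o++)
		old[o] = new[o] = initial[o];

	/* Old behaviour: take the largest free block and expand() it down
	 * to the requested order 0, putting one split half back per level;
	 * the final order-0 page is handed out. */
	for (o = MAX_ORDER - 1; o > 0 && !old[o]; o--)
		;
	old[o]--;
	while (o > 0) {
		o--;
		old[o]++;
	}

	/* New behaviour: the whole pageblock was already stolen, so retry
	 * the smallest-first path and hand out an existing order-0 page. */
	new[0]--;

	print_left("without patch", old);	/* 5 x order-0, 1 x order-1, 1 x order-2 */
	print_left("with patch   ", new);	/* 3 x order-0, 1 x order-3 */
	return 0;
}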
Signed-off-by: Joonsoo Kim <[email protected]>
---
mm/page_alloc.c | 40 +++++++++++++++++++++-------------------
1 file changed, 21 insertions(+), 19 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ed0f1c6..044f16c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1239,14 +1239,14 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
}
/* Remove an element from the buddy allocator from the fallback list */
-static inline struct page *
+static inline bool
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
struct free_area *area;
unsigned int current_order;
struct page *page;
int fallback_mt;
- bool can_steal;
+ bool can_steal_pageblock;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1;
@@ -1254,26 +1254,24 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
+ start_migratetype, false,
+ &can_steal_pageblock);
if (fallback_mt == -1)
continue;
page = list_entry(area->free_list[fallback_mt].next,
struct page, lru);
- if (can_steal)
+ BUG_ON(!page);
+
+ if (can_steal_pageblock)
steal_suitable_fallback(zone, page, start_migratetype);
- /* Remove the page from the freelists */
- area->nr_free--;
- list_del(&page->lru);
- rmv_page_order(page);
+ list_move(&page->lru, &area->free_list[start_migratetype]);
- expand(zone, page, order, current_order, area,
- start_migratetype);
/*
* The freepage_migratetype may differ from pageblock's
* migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
+ * find_suitable_fallback(). This is OK as long as it
* does not differ for MIGRATE_CMA pageblocks. For CMA
* we need to make sure unallocated pages flushed from
* pcp lists are returned to the correct freelist.
@@ -1283,10 +1281,10 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
- return page;
+ return true;
}
- return NULL;
+ return false;
}
/*
@@ -1297,28 +1295,32 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
int migratetype)
{
struct page *page;
+ bool steal_fallback;
-retry_reserve:
+retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype);
+ if (page)
+ goto out;
+
+ steal_fallback = __rmqueue_fallback(zone, order, migratetype);
/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
* is used because __rmqueue_smallest is an inline function
* and we want just one call site
*/
- if (!page) {
+ if (!steal_fallback)
migratetype = MIGRATE_RESERVE;
- goto retry_reserve;
- }
+
+ goto retry;
}
+out:
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
--
1.9.1
Sometimes we take more freepages from the buddy lists than we really need,
in order to refill the pcp list. This may speed up subsequent allocation
requests, but it can also increase fragmentation if we steal freepages from
another migratetype's buddy list excessively. This patch changes the
behaviour so that fallback allocation stops once we have already obtained
some freepages, in order to reduce fragmentation.
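A minimal sketch of the changed bulk-refill behaviour as a plain user-space
model; take_from_own_freelist() and steal_from_other_migratetype() are
hypothetical stand-ins, not allocator functions:

#include <stdio.h>
#include <stdbool.h>

/* Stand-ins for __rmqueue_smallest() and the fallback path. */
static bool take_from_own_freelist(void)       { return false; } /* pretend our list is empty */
static bool steal_from_other_migratetype(void) { return true;  }

/* Simplified model of __rmqueue() with the new 'index' argument:
 * fallback stealing is only allowed for the first page of a bulk refill. */
static bool rmqueue(int index)
{
	if (take_from_own_freelist())
		return true;
	if (index != 0)		/* already got some pages: stop here */
		return false;
	return steal_from_other_migratetype();
}

int main(void)
{
	int count = 8, got = 0;
	int i;

	/* Model of rmqueue_bulk(): the refill stops at the first failure. */
	for (i = 0; i < count; i++) {
		if (!rmqueue(i))
			break;
		got++;
	}
	printf("refilled %d of %d requested pages\n", got, count);
	return 0;
}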
CPU: 8
RAM: 512 MB with zram swap
WORKLOAD: kernel build with -j12
OPTION: page owner is enabled to measure fragmentation
After finishing the build, check fragmentation by 'cat /proc/pagetypeinfo'
* Before
Number of blocks type (movable)
DMA32: 208.4
Number of mixed blocks (movable)
DMA32: 139
A mixed block is a movable pageblock that contains one or more pages
allocated for unmovable/reclaimable use. The results show that more than
half of the movable pageblocks are tainted by allocations of other
migratetypes.
* After
Number of blocks type (movable)
DMA32: 207
Number of mixed blocks (movable)
DMA32: 111.2
This result shows that the number of non-mixed blocks increases by 38% in
this case.
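(For reference, the 38% appears to come from the non-mixed counts: before,
208.4 - 139 = 69.4 non-mixed movable blocks; after, 207 - 111.2 = 95.8; and
95.8 / 69.4 is roughly 1.38.)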
Signed-off-by: Joonsoo Kim <[email protected]>
---
mm/page_alloc.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 044f16c..fbe2211 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1292,7 +1292,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
* Call me with the zone->lock already held.
*/
static struct page *__rmqueue(struct zone *zone, unsigned int order,
- int migratetype)
+ int migratetype, int index)
{
struct page *page;
bool steal_fallback;
@@ -1301,6 +1301,10 @@ retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
+ /* We already got some freepages so don't do aggressive stealing */
+ if (index != 0)
+ goto out;
+
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
@@ -1338,7 +1342,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- struct page *page = __rmqueue(zone, order, migratetype);
+ struct page *page = __rmqueue(zone, order, migratetype, i);
if (unlikely(page == NULL))
break;
@@ -1749,7 +1753,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order, migratetype);
+ page = __rmqueue(zone, order, migratetype, 0);
spin_unlock(&zone->lock);
if (!page)
goto failed;
--
1.9.1
We already have an anti-fragmentation policy in the page allocator. It works
well when system memory is sufficient, but it doesn't work well when system
memory isn't sufficient, because memory is then already highly fragmented
and the fallback/steal mechanism cannot take a whole pageblock. If there is
a heavy unmovable allocation requester such as zram, the problem can get worse.
CPU: 8
RAM: 512 MB with zram swap
WORKLOAD: kernel build with -j12
OPTION: page owner is enabled to measure fragmentation
After finishing the build, check fragmentation by 'cat /proc/pagetypeinfo'
* Before
Number of blocks type (movable)
DMA32: 207
Number of mixed blocks (movable)
DMA32: 111.2
A mixed block is a movable pageblock that contains one or more pages
allocated for unmovable/reclaimable use. The results show that more than
half of the movable pageblocks are tainted by allocations of other
migratetypes.
To mitigate this fragmentation, this patch implements an active
anti-fragmentation algorithm. The idea is really simple: when an
unmovable/reclaimable allocation steals from a movable pageblock, we try to
migrate out the remaining migratable pages in that pageblock and use the
resulting freepages for further allocation requests of the corresponding
migratetype.
Once an unmovable allocation taints a movable pageblock, the block cannot
easily recover. Instead of hoping that it gets restored, it is more
reasonable to turn it into an unmovable pageblock as completely as possible
and use it for further unmovable requests.
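The hand-off in this patch goes through a fixed pool of preallocated request
slots, so the allocator hot path never allocates memory and simply drops the
request when the pool is exhausted. A minimal user-space model of that
pattern follows (slot count, names and the printed messages are illustrative
only; the real code uses a spinlock-protected list and queue_work() on
system_highpri_wq):

#include <stdio.h>
#include <stdbool.h>

#define NUM_SLOTS 4

struct antifrag_request {
	bool in_use;
	unsigned long base_pfn;
	int migratetype;
	int nr_moved;
};

static struct antifrag_request slots[NUM_SLOTS];

/* Hot-path side: grab a free slot and "queue" it; drop the request when
 * the pool is exhausted rather than blocking or allocating. */
static bool request_antifrag(unsigned long base_pfn, int mt, int nr_moved)
{
	int i;

	for (i = 0; i < NUM_SLOTS; i++) {
		if (!slots[i].in_use) {
			slots[i].in_use = true;
			slots[i].base_pfn = base_pfn;
			slots[i].migratetype = mt;
			slots[i].nr_moved = nr_moved;
			return true;
		}
	}
	return false;	/* pool exhausted: silently skip */
}

/* Worker side: process and release a slot (the real worker isolates and
 * migrates the remaining movable pages of the pageblock). */
static void process_one(struct antifrag_request *req)
{
	printf("emptying pageblock at pfn 0x%lx for migratetype %d (%d already moved)\n",
	       req->base_pfn, req->migratetype, req->nr_moved);
	req->in_use = false;
}

int main(void)
{
	unsigned long pfn;
	int i;

	for (pfn = 0x1000; pfn < 0x1000 + 6 * 512; pfn += 512)
		if (!request_antifrag(pfn, 1, 100))
			printf("dropped request for pfn 0x%lx\n", pfn);

	for (i = 0; i < NUM_SLOTS; i++)
		if (slots[i].in_use)
			process_one(&slots[i]);
	return 0;
}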
Below is the result of this idea.
* After
Number of blocks type (movable)
DMA32: 208.2
Number of mixed blocks (movable)
DMA32: 55.8
The result shows that the number of non-mixed blocks increases by 59% in
this case.
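(Here the non-mixed counts are 207 - 111.2 = 95.8 before and
208.2 - 55.8 = 152.4 after; 152.4 / 95.8 is roughly 1.59, hence the 59%.)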
Signed-off-by: Joonsoo Kim <[email protected]>
---
include/linux/compaction.h | 8 +++
include/linux/gfp.h | 3 +
mm/compaction.c | 156 +++++++++++++++++++++++++++++++++++++++++++++
mm/internal.h | 1 +
mm/page_alloc.c | 32 +++++++++-
mm/page_isolation.c | 24 -------
6 files changed, 198 insertions(+), 26 deletions(-)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index aa8f61c..3a6bb81 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -51,6 +51,9 @@ extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
extern bool compaction_restarting(struct zone *zone, int order);
+extern void wake_up_antifrag(struct zone *zone, unsigned long base_pfn,
+ int mt, int nr_moved);
+
#else
static inline unsigned long try_to_compact_pages(gfp_t gfp_mask,
unsigned int order, int alloc_flags,
@@ -83,6 +86,11 @@ static inline bool compaction_deferred(struct zone *zone, int order)
return true;
}
+static inline void wake_up_antifrag(struct zone *zone, unsigned long base_pfn,
+ int mt, int nr_moved)
+{
+}
+
#endif /* CONFIG_COMPACTION */
#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 97a9373..a0dec6c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -355,6 +355,9 @@ void free_pages_exact(void *virt, size_t size);
/* This is different from alloc_pages_exact_node !!! */
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
+extern struct page *alloc_migrate_target(struct page *page,
+ unsigned long private, int **resultp);
+
#define __get_free_page(gfp_mask) \
__get_free_pages((gfp_mask), 0)
diff --git a/mm/compaction.c b/mm/compaction.c
index 018f08d..0d76b9e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -7,6 +7,10 @@
*
* Copyright IBM Corp. 2007-2010 Mel Gorman <[email protected]>
*/
+
+#define pr_fmt(fmt) "compact: " fmt
+#define DEBUG
+
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
@@ -663,6 +667,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (cc->mode == MIGRATE_ASYNC)
return 0;
+ if (cc->ignore_congestion_wait)
+ break;
+
congestion_wait(BLK_RW_ASYNC, HZ/10);
if (fatal_signal_pending(current))
@@ -1714,4 +1721,153 @@ void compaction_unregister_node(struct node *node)
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+#define NUM_ANTIFRAG_INFOS (100)
+
+struct antifrag_info {
+ struct work_struct antifrag_work;
+ struct list_head list;
+ unsigned long base_pfn;
+ int mt;
+ int nr_moved;
+};
+
+static DEFINE_SPINLOCK(infos_lock);
+static LIST_HEAD(free_infos);
+static struct antifrag_info infos[NUM_ANTIFRAG_INFOS];
+
+static unsigned long pending;
+static bool antifrag_initialized;
+
+void wake_up_antifrag(struct zone *zone, unsigned long base_pfn,
+ int mt, int nr_moved)
+{
+ struct antifrag_info *info;
+ unsigned long flags;
+
+ if (unlikely(!antifrag_initialized))
+ return;
+
+ spin_lock_irqsave(&infos_lock, flags);
+ if (list_empty(&free_infos)) {
+ spin_unlock_irqrestore(&infos_lock, flags);
+ return;
+ }
+
+ info = list_first_entry(&free_infos, struct antifrag_info, list);
+ list_del(&info->list);
+ pending++;
+ spin_unlock_irqrestore(&infos_lock, flags);
+
+ info->base_pfn = base_pfn;
+ info->mt = mt;
+ info->nr_moved = nr_moved;
+
+ pr_debug("%s: %d: wakeup (0x%lx, %d, %d, %lu)\n",
+ __func__, smp_processor_id(), base_pfn, mt, nr_moved, pending);
+ queue_work(system_highpri_wq, &info->antifrag_work);
+}
+
+static void empty_pageblock(unsigned long base_pfn, int mt, int nr_moved)
+{
+ int cpu;
+ int ret = 0;
+ int curr_moved = 0;
+ int count = 0;
+ unsigned long start, end, pfn;
+ unsigned long empty_threshold = 1 << (pageblock_order - 1);
+ struct page *base_page = pfn_to_page(base_pfn);
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(base_pfn)),
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+ .ignore_congestion_wait = true,
+ };
+ LIST_HEAD(isolated_pages);
+
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ start = round_down(base_pfn, pageblock_nr_pages);
+ end = start + pageblock_nr_pages;
+ pfn = start;
+ while (pfn < end) {
+ if (fatal_signal_pending(current))
+ break;
+
+ cc.nr_migratepages = 0;
+ pfn = isolate_migratepages_range(&cc, pfn, end);
+ if (!pfn)
+ break;
+
+ count += cc.nr_migratepages;
+ list_splice_tail_init(&cc.migratepages, &isolated_pages);
+ }
+
+ if (count && nr_moved + count >= empty_threshold) {
+ spin_lock_irq(&cc.zone->lock);
+ set_pageblock_migratetype(base_page, mt);
+ curr_moved = move_freepages_block(cc.zone, base_page, mt);
+ spin_unlock_irq(&cc.zone->lock);
+
+ ret = migrate_pages(&isolated_pages, alloc_migrate_target,
+ NULL, __GFP_MEMALLOC, cc.mode, MR_COMPACTION);
+ if (ret > 0)
+ count -= ret;
+
+ cpu = get_cpu();
+ lru_add_drain_cpu(cpu);
+ drain_local_pages(cc.zone);
+ put_cpu();
+
+ spin_lock_irq(&cc.zone->lock);
+ curr_moved = move_freepages_block(cc.zone, base_page, mt);
+ spin_unlock_irq(&cc.zone->lock);
+
+ pr_debug("%s: %d: emptying success (0x%lx, %d, %d, %d %lu)\n",
+ __func__, smp_processor_id(),
+ base_pfn, mt, nr_moved, curr_moved, pending);
+ } else
+ pr_debug("%s: %d: emptying skipped (0x%lx, %d, %d, %d %lu)\n",
+ __func__, smp_processor_id(),
+ base_pfn, mt, nr_moved, nr_moved + count, pending);
+
+ putback_movable_pages(&isolated_pages);
+}
+
+static void do_antifrag(struct work_struct *work)
+{
+ struct antifrag_info *info =
+ container_of(work, struct antifrag_info, antifrag_work);
+
+ pr_debug("%s: %d: worker (0x%lx, %d, %d, %lu)\n",
+ __func__, smp_processor_id(),
+ info->base_pfn, info->mt, info->nr_moved, pending);
+
+ empty_pageblock(info->base_pfn, info->mt, info->nr_moved);
+
+ spin_lock_irq(&infos_lock);
+ list_add(&info->list, &free_infos);
+ pending--;
+ spin_unlock_irq(&infos_lock);
+}
+
+static int __init antifrag_init(void)
+{
+ int i;
+
+ spin_lock_irq(&infos_lock);
+ for (i = 0; i < NUM_ANTIFRAG_INFOS; i++) {
+ INIT_LIST_HEAD(&infos[i].list);
+ INIT_WORK(&infos[i].antifrag_work, do_antifrag);
+ list_add(&infos[i].list, &free_infos);
+ }
+ spin_unlock_irq(&infos_lock);
+
+ antifrag_initialized = true;
+
+ return 0;
+}
+module_init(antifrag_init)
+
#endif /* CONFIG_COMPACTION */
diff --git a/mm/internal.h b/mm/internal.h
index a25e359..78527d7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -192,6 +192,7 @@ struct compact_control {
* contention detected during
* compaction
*/
+ bool ignore_congestion_wait;
};
unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fbe2211..0878ac5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1182,7 +1182,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
* use it's pages as requested migratetype in the future.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
- int start_type)
+ int start_type, int fallback_type)
{
int current_order = page_order(page);
int pages;
@@ -1194,6 +1194,8 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
}
pages = move_freepages_block(zone, page, start_type);
+ if (start_type != MIGRATE_MOVABLE && fallback_type == MIGRATE_MOVABLE)
+ wake_up_antifrag(zone, page_to_pfn(page), start_type, pages);
/* Claim the whole block if over half of it is free */
if (pages >= (1 << (pageblock_order-1)) ||
@@ -1264,7 +1266,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
BUG_ON(!page);
if (can_steal_pageblock)
- steal_suitable_fallback(zone, page, start_migratetype);
+ steal_suitable_fallback(zone, page,
+ start_migratetype, fallback_mt);
list_move(&page->lru, &area->free_list[start_migratetype]);
@@ -6534,6 +6537,31 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
#endif
+struct page *alloc_migrate_target(struct page *page, unsigned long private,
+ int **resultp)
+{
+ gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE | private;
+
+ /*
+ * TODO: allocate a destination hugepage from a nearest neighbor node,
+ * accordance with memory policy of the user process if possible. For
+ * now as a simple work-around, we use the next node for destination.
+ */
+ if (PageHuge(page)) {
+ nodemask_t src = nodemask_of_node(page_to_nid(page));
+ nodemask_t dst;
+
+ nodes_complement(dst, src);
+ return alloc_huge_page_node(page_hstate(compound_head(page)),
+ next_node(page_to_nid(page), dst));
+ }
+
+ if (PageHighMem(page))
+ gfp_mask |= __GFP_HIGHMEM;
+
+ return alloc_page(gfp_mask);
+}
+
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 0c4505b..5f5dfa5 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -290,27 +290,3 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
spin_unlock_irqrestore(&zone->lock, flags);
return ret ? 0 : -EBUSY;
}
-
-struct page *alloc_migrate_target(struct page *page, unsigned long private,
- int **resultp)
-{
- gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
-
- /*
- * TODO: allocate a destination hugepage from a nearest neighbor node,
- * accordance with memory policy of the user process if possible. For
- * now as a simple work-around, we use the next node for destination.
- */
- if (PageHuge(page)) {
- nodemask_t src = nodemask_of_node(page_to_nid(page));
- nodemask_t dst;
- nodes_complement(dst, src);
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- next_node(page_to_nid(page), dst));
- }
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
- return alloc_page(gfp_mask);
-}
--
1.9.1
On Mon, Apr 27, 2015 at 04:23:39PM +0900, Joonsoo Kim wrote:
> When we steal whole pageblock, we don't need to break highest order
> freepage. Perhaps, there is small order freepage so we can use it.
>
The reason why the largest block is taken is to reduce the probability
there will be another fallback event in the near future. Early on, there
were a lot of tests conducted to measure the number of external fragmenting
events and take steps to reduce them. Stealing the largest highest order
freepage was one of those steps.
--
Mel Gorman
SUSE Labs
On Mon, Apr 27, 2015 at 04:23:41PM +0900, Joonsoo Kim wrote:
> We already have antifragmentation policy in page allocator. It works well
> when system memory is sufficient, but, it doesn't works well when system
> memory isn't sufficient because memory is already highly fragmented and
> fallback/steal mechanism cannot get whole pageblock. If there is severe
> unmovable allocation requestor like zram, problem could get worse.
>
> CPU: 8
> RAM: 512 MB with zram swap
> WORKLOAD: kernel build with -j12
> OPTION: page owner is enabled to measure fragmentation
> After finishing the build, check fragmentation by 'cat /proc/pagetypeinfo'
>
> * Before
> Number of blocks type (movable)
> DMA32: 207
>
> Number of mixed blocks (movable)
> DMA32: 111.2
>
> Mixed blocks means that there is one or more allocated page for
> unmovable/reclaimable allocation in movable pageblock. Results shows that
> more than half of movable pageblock is tainted by other migratetype
> allocation.
>
> To mitigate this fragmentation, this patch implements active
> anti-fragmentation algorithm. Idea is really simple. When some
> unmovable/reclaimable steal happens from movable pageblock, we try to
> migrate out other pages that can be migratable in this pageblock are and
> use these generated freepage for further allocation request of
> corresponding migratetype.
>
> Once unmovable allocation taints movable pageblock, it cannot easily
> recover. Instead of praying that it gets restored, making it unmovable
> pageblock as much as possible and using it further unmovable request
> would be more reasonable approach.
>
> Below is result of this idea.
>
> * After
> Number of blocks type (movable)
> DMA32: 208.2
>
> Number of mixed blocks (movable)
> DMA32: 55.8
>
> Result shows that non-mixed block increase by 59% in this case.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
I haven't read the patch in detail but there were a few reasons why
active avoidance was not implemented originally.
1. If pages in the target block were reclaimed then it potentially
increased stall latency in the future when they had to be refaulted
again. A prototype that used lumpy reclaim originally suffered extreme
stalls and was ultimately abandoned. The alternative at the time was
to increase min_free_kbytes by default as it had a similar effect with
much less disruption
2. If the pages in the target block were migrated then there was
compaction overhead with no guarantee of success. Again, there were
concerns about stalls. This was not deferred to an external thread
because if the fragmenting process did not stall then it could simply
cause more fragmentation-related damage while the thread executes. It
becomes very unpredictable. While migration is in progress, processes
also potentially stall if they reference the targetted pages.
3. Further on 2, the migration itself potentially triggers more fallback
events while pages are isolated for the migration.
4. Migrating pages to another node is a bad idea. It requires a NUMA
machine at the very least but more importantly it could violate memory
policies. If the page was mapped then the VMA could be checked but if the
pages were unmapped then the kernel potentially violates memory policies
At the time it was implemented, fragmentation avoidance was primarily
concerned about allocating hugetlbfs pages and later THP. Failing either
was not a functional failure that users would care about but large stalls
due to active fragmentation avoidance would disrupt workloads badly.
Just be sure to take the stalling and memory policy problems into
account.
--
Mel Gorman
SUSE Labs
On Mon, Apr 27, 2015 at 09:08:50AM +0100, Mel Gorman wrote:
> On Mon, Apr 27, 2015 at 04:23:39PM +0900, Joonsoo Kim wrote:
> > When we steal whole pageblock, we don't need to break highest order
> > freepage. Perhaps, there is small order freepage so we can use it.
> >
>
> The reason why the largest block is taken is to reduce the probability
> there will be another fallback event in the near future. Early on, there
> were a lot of tests conducted to measure the number of external fragmenting
> events and take steps to reduce them. Stealing the largest highest order
> freepage was one of those steps.
Hello, Mel.
The purpose of this patch is not to "stop stealing the highest-order freepage".
Currently, when we steal all freepages in a certain pageblock, including the
highest-order one, we break the highest-order freepage and return part of it
even if there is a low-order freepage that we just stole and could use
immediately.
For example,
pageblock A has 5 freepages (4 * order 0, 1 * order 3) and
we try to steal all freepages on pageblock A.
Without this patch, we move all freepages to the requested migratetype's
buddy list and break the order-3 freepage. The leftover is as follows:
(5 * order 0, 1 * order 1, 1 * order 2)
With this patch, (3 * order 0, 1 * order 3) remains.
I think this is better than before because we still have the high-order
page. Isn't it?
Thanks.
On Mon, Apr 27, 2015 at 09:29:23AM +0100, Mel Gorman wrote:
> On Mon, Apr 27, 2015 at 04:23:41PM +0900, Joonsoo Kim wrote:
> > We already have antifragmentation policy in page allocator. It works well
> > when system memory is sufficient, but, it doesn't works well when system
> > memory isn't sufficient because memory is already highly fragmented and
> > fallback/steal mechanism cannot get whole pageblock. If there is severe
> > unmovable allocation requestor like zram, problem could get worse.
> >
> > CPU: 8
> > RAM: 512 MB with zram swap
> > WORKLOAD: kernel build with -j12
> > OPTION: page owner is enabled to measure fragmentation
> > After finishing the build, check fragmentation by 'cat /proc/pagetypeinfo'
> >
> > * Before
> > Number of blocks type (movable)
> > DMA32: 207
> >
> > Number of mixed blocks (movable)
> > DMA32: 111.2
> >
> > Mixed blocks means that there is one or more allocated page for
> > unmovable/reclaimable allocation in movable pageblock. Results shows that
> > more than half of movable pageblock is tainted by other migratetype
> > allocation.
> >
> > To mitigate this fragmentation, this patch implements active
> > anti-fragmentation algorithm. Idea is really simple. When some
> > unmovable/reclaimable steal happens from movable pageblock, we try to
> > migrate out other pages that can be migratable in this pageblock are and
> > use these generated freepage for further allocation request of
> > corresponding migratetype.
> >
> > Once unmovable allocation taints movable pageblock, it cannot easily
> > recover. Instead of praying that it gets restored, making it unmovable
> > pageblock as much as possible and using it further unmovable request
> > would be more reasonable approach.
> >
> > Below is result of this idea.
> >
> > * After
> > Number of blocks type (movable)
> > DMA32: 208.2
> >
> > Number of mixed blocks (movable)
> > DMA32: 55.8
> >
> > Result shows that non-mixed block increase by 59% in this case.
> >
> > Signed-off-by: Joonsoo Kim <[email protected]>
>
> I haven't read the patch in detail but there were a few reasons why
> active avoidance was not implemented originally.
Thanks for the really good comments. They help me understand why the code
is in its current form and what I should consider.
>
> 1. If pages in the target block were reclaimed then it potentially
> increased stall latency in the future when they had to be refaulted
> again. A prototype that used lumpy reclaim originally suffered extreme
> stalls and was ultimately abandoned. The alternative at the time was
> to increase min_free_kbytes by default as it had a similar effect with
> much less disruption
Reclaim is not used by this patchset.
> 2. If the pages in the target block were migrated then there was
> compaction overhead with no guarantee of success. Again, there were
> concerns about stalls. This was not deferred to an external thread
> because if the fragmenting process did not stall then it could simply
> cause more fragmentation-related damage while the thread executes. It
Yes, this patch uses migration for active fragmentation avoidance.
But I'm not sure why the external-thread approach would simply cause more
fragmentation-related damage. It may not be able to keep up with the
allocation speed of the fragmenting process, but even in that case
fragmentation would be lower than with a do-nothing approach.
> becomes very unpredictable. While migration is in progress, processes
> also potentially stall if they reference the targetted pages.
I admit that if processes reference the targeted pages they would stall,
and that there is compaction overhead with no guarantee of success. But
this fragmentation avoidance is really needed on low-memory systems,
because there fragmentation can be so high that even order-2 allocations
suffer from it. File pages end up being excessively reclaimed just to make
an order-2 page. I think that reclaim overhead is worse than the overhead
caused by this fragmentation avoidance algorithm.
> 3. Further on 2, the migration itself potentially triggers more fallback
> events while pages are isolated for the migration.
>
> 4. Migrating pages to another node is a bad idea. It requires a NUMA
> machine at the very least but more importantly it could violate memory
> policies. If the page was mapped then the VMA could be checked but if the
> pages were unmapped then the kernel potentially violates memory policies
I agree. Migrating pages to another node is not the intended behaviour.
I will fix it.
> At the time it was implemented, fragmentation avoidance was primarily
> concerned about allocating hugetlbfs pages and later THP. Failing either
> was not a functional failure that users would care about but large stalls
> due to active fragmentation avoidance would disrupt workloads badly.
I see. But the failure this patchset addresses is a kind of functional
failure: if unmovable pages are mixed with movable pages in a movable
pageblock, even small-order allocations cannot be served easily on a
highly fragmented low-memory system.
> Just be sure to take the stalling and memory policy problems into
> account.
Okay. This version of the patch doesn't consider stalling, so it isolates
all migratable pages in the pageblock at once. :)
I will fix it in the next spin.
Ditto for memory policy.
Thanks.
On 04/27/2015 09:23 AM, Joonsoo Kim wrote:
> When we steal whole pageblock, we don't need to break highest order
> freepage. Perhaps, there is small order freepage so we can use it.
>
> This also gives us some code size reduction because expand() which
> is used in __rmqueue_fallback() and inlined into __rmqueue_fallback()
> is removed.
>
> text data bss dec hex filename
> 37413 1440 624 39477 9a35 mm/page_alloc.o
> 37249 1440 624 39313 9991 mm/page_alloc.o
>
> Signed-off-by: Joonsoo Kim <[email protected]>
> ---
> mm/page_alloc.c | 40 +++++++++++++++++++++-------------------
> 1 file changed, 21 insertions(+), 19 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index ed0f1c6..044f16c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1239,14 +1239,14 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
> }
>
> /* Remove an element from the buddy allocator from the fallback list */
This description is no longer accurate.
> -static inline struct page *
> +static inline bool
> __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
> {
> struct free_area *area;
> unsigned int current_order;
> struct page *page;
> int fallback_mt;
> - bool can_steal;
> + bool can_steal_pageblock;
>
> /* Find the largest possible block of pages in the other list */
> for (current_order = MAX_ORDER-1;
> @@ -1254,26 +1254,24 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
> --current_order) {
> area = &(zone->free_area[current_order]);
> fallback_mt = find_suitable_fallback(area, current_order,
> - start_migratetype, false, &can_steal);
> + start_migratetype, false,
> + &can_steal_pageblock);
> if (fallback_mt == -1)
> continue;
>
> page = list_entry(area->free_list[fallback_mt].next,
> struct page, lru);
> - if (can_steal)
> + BUG_ON(!page);
Please, no new BUG_ON. Maybe a VM_BUG_ON for debugging; otherwise just let
it panic on the null pointer dereference when accessing page->lru later on.
> +
> + if (can_steal_pageblock)
> steal_suitable_fallback(zone, page, start_migratetype);
>
> - /* Remove the page from the freelists */
> - area->nr_free--;
> - list_del(&page->lru);
> - rmv_page_order(page);
> + list_move(&page->lru, &area->free_list[start_migratetype]);
This list_move is redundant if we are stealing the whole pageblock, right?
Just put it in an else branch of the if above, and explain that in a comment?
> - expand(zone, page, order, current_order, area,
> - start_migratetype);
> /*
> * The freepage_migratetype may differ from pageblock's
> * migratetype depending on the decisions in
> - * try_to_steal_freepages(). This is OK as long as it
> + * find_suitable_fallback(). This is OK as long as it
> * does not differ for MIGRATE_CMA pageblocks. For CMA
> * we need to make sure unallocated pages flushed from
> * pcp lists are returned to the correct freelist.
The whole thing with set_freepage_migratetype(page, start_migratetype);
below this comment is now redundant, as rmqueue_smallest will do it too.
The comment itself has become outdated and misplaced too. I guess
MIGRATE_CMA is now handled just by the fact that it is not set as a
fallback in the fallbacks array?
> @@ -1283,10 +1281,10 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
> trace_mm_page_alloc_extfrag(page, order, current_order,
> start_migratetype, fallback_mt);
>
> - return page;
> + return true;
> }
>
> - return NULL;
> + return false;
> }
>
> /*
> @@ -1297,28 +1295,32 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
> int migratetype)
> {
> struct page *page;
> + bool steal_fallback;
>
> -retry_reserve:
> +retry:
> page = __rmqueue_smallest(zone, order, migratetype);
>
> if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
> if (migratetype == MIGRATE_MOVABLE)
> page = __rmqueue_cma_fallback(zone, order);
>
> - if (!page)
> - page = __rmqueue_fallback(zone, order, migratetype);
> + if (page)
> + goto out;
> +
> + steal_fallback = __rmqueue_fallback(zone, order, migratetype);
>
> /*
> * Use MIGRATE_RESERVE rather than fail an allocation. goto
> * is used because __rmqueue_smallest is an inline function
> * and we want just one call site
> */
> - if (!page) {
> + if (!steal_fallback)
> migratetype = MIGRATE_RESERVE;
> - goto retry_reserve;
> - }
> +
> + goto retry;
> }
>
> +out:
> trace_mm_page_alloc_zone_locked(page, order, migratetype);
> return page;
> }
>
On 05/12/2015 09:51 AM, Vlastimil Babka wrote:
>> {
>> struct page *page;
>> + bool steal_fallback;
>>
>> -retry_reserve:
>> +retry:
>> page = __rmqueue_smallest(zone, order, migratetype);
>>
>> if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
>> if (migratetype == MIGRATE_MOVABLE)
>> page = __rmqueue_cma_fallback(zone, order);
>>
>> - if (!page)
>> - page = __rmqueue_fallback(zone, order, migratetype);
>> + if (page)
>> + goto out;
>> +
>> + steal_fallback = __rmqueue_fallback(zone, order, migratetype);
Oh, and the variable can probably be replaced by calling
__rmqueue_fallback() directly in the if () below.
>>
>> /*
>> * Use MIGRATE_RESERVE rather than fail an allocation. goto
>> * is used because __rmqueue_smallest is an inline function
>> * and we want just one call site
>> */
>> - if (!page) {
>> + if (!steal_fallback)
>> migratetype = MIGRATE_RESERVE;
>> - goto retry_reserve;
>> - }
>> +
>> + goto retry;
>> }
>>
>> +out:
>> trace_mm_page_alloc_zone_locked(page, order, migratetype);
>> return page;
>> }
>>
>
On 04/27/2015 10:42 AM, Joonsoo Kim wrote:
> On Mon, Apr 27, 2015 at 09:08:50AM +0100, Mel Gorman wrote:
>> On Mon, Apr 27, 2015 at 04:23:39PM +0900, Joonsoo Kim wrote:
>>> When we steal whole pageblock, we don't need to break highest order
>>> freepage. Perhaps, there is small order freepage so we can use it.
>>>
>>
>> The reason why the largest block is taken is to reduce the probability
>> there will be another fallback event in the near future. Early on, there
>> were a lot of tests conducted to measure the number of external fragmenting
>> events and take steps to reduce them. Stealing the largest highest order
>> freepage was one of those steps.
>
> Hello, Mel.
>
> Purpose of this patch is not "stop steal highest order freepage".
> Currently, in case of that we steal all freepage including highest
> order freepage in certain pageblock, we break highest order freepage and
> return it even if we have low order freepage that we immediately steal.
>
> For example,
>
> Pageblock A has 5 freepage (4 * order 0, 1 * order 3) and
> we try to steal all freepage on pageblock A.
>
> Withouth this patch, we move all freepage to requested migratetype
> buddy list and break order 3 freepage. Leftover is like as following.
>
> (5 * order 0, 1 * order 1, 1* order 2)
>
> With this patch, (3 * order 0, 1 * order 3) remains.
>
> I think that this is better than before because we still have high order
> page. Isn't it?
I agree that this should be better in some cases and shouldn't be worse
in any case. Nice catch.
> Thanks.
>
On 04/27/2015 09:23 AM, Joonsoo Kim wrote:
> Sometimes we try to get more freepages from buddy list than how much
> we really need, in order to refill pcp list. This may speed up following
> allocation request, but, there is a possibility to increase fragmentation
> if we steal freepages from other migratetype buddy list excessively. This
> patch changes this behaviour to stop fallback allocation in order to
> reduce fragmentation if we already get some freepages.
>
> CPU: 8
> RAM: 512 MB with zram swap
> WORKLOAD: kernel build with -j12
> OPTION: page owner is enabled to measure fragmentation
> After finishing the build, check fragmentation by 'cat /proc/pagetypeinfo'
>
> * Before
> Number of blocks type (movable)
> DMA32: 208.4
>
> Number of mixed blocks (movable)
> DMA32: 139
>
> Mixed blocks means that there is one or more allocated page for
> unmovable/reclaimable allocation in movable pageblock. Results shows that
> more than half of movable pageblock is tainted by other migratetype
> allocation.
>
> * After
> Number of blocks type (movable)
> DMA32: 207
>
> Number of mixed blocks (movable)
> DMA32: 111.2
>
> This result shows that non-mixed block increase by 38% in this case.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
I agree that keeping fragmentation low is more important than filling up
the pcplists. I wouldn't expect such a large difference, though. Are the
results stable?
> ---
> mm/page_alloc.c | 10 +++++++---
> 1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 044f16c..fbe2211 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1292,7 +1292,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
> * Call me with the zone->lock already held.
> */
> static struct page *__rmqueue(struct zone *zone, unsigned int order,
> - int migratetype)
> + int migratetype, int index)
> {
> struct page *page;
> bool steal_fallback;
> @@ -1301,6 +1301,10 @@ retry:
> page = __rmqueue_smallest(zone, order, migratetype);
>
> if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
> + /* We already get some freepages so don't do agressive steal */
> + if (index != 0)
> + goto out;
> +
> if (migratetype == MIGRATE_MOVABLE)
> page = __rmqueue_cma_fallback(zone, order);
>
> @@ -1338,7 +1342,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>
> spin_lock(&zone->lock);
> for (i = 0; i < count; ++i) {
> - struct page *page = __rmqueue(zone, order, migratetype);
> + struct page *page = __rmqueue(zone, order, migratetype, i);
> if (unlikely(page == NULL))
> break;
>
> @@ -1749,7 +1753,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
> WARN_ON_ONCE(order > 1);
> }
> spin_lock_irqsave(&zone->lock, flags);
> - page = __rmqueue(zone, order, migratetype);
> + page = __rmqueue(zone, order, migratetype, 0);
> spin_unlock(&zone->lock);
> if (!page)
> goto failed;
>
On 04/28/2015 09:45 AM, Joonsoo Kim wrote:
> On Mon, Apr 27, 2015 at 09:29:23AM +0100, Mel Gorman wrote:
>> On Mon, Apr 27, 2015 at 04:23:41PM +0900, Joonsoo Kim wrote:
>>> We already have antifragmentation policy in page allocator. It works well
>>> when system memory is sufficient, but, it doesn't works well when system
>>> memory isn't sufficient because memory is already highly fragmented and
>>> fallback/steal mechanism cannot get whole pageblock. If there is severe
>>> unmovable allocation requestor like zram, problem could get worse.
>>>
>>> CPU: 8
>>> RAM: 512 MB with zram swap
>>> WORKLOAD: kernel build with -j12
>>> OPTION: page owner is enabled to measure fragmentation
>>> After finishing the build, check fragmentation by 'cat /proc/pagetypeinfo'
>>>
>>> * Before
>>> Number of blocks type (movable)
>>> DMA32: 207
>>>
>>> Number of mixed blocks (movable)
>>> DMA32: 111.2
>>>
>>> Mixed blocks means that there is one or more allocated page for
>>> unmovable/reclaimable allocation in movable pageblock. Results shows that
>>> more than half of movable pageblock is tainted by other migratetype
>>> allocation.
>>>
>>> To mitigate this fragmentation, this patch implements active
>>> anti-fragmentation algorithm. Idea is really simple. When some
>>> unmovable/reclaimable steal happens from movable pageblock, we try to
>>> migrate out other pages that can be migratable in this pageblock are and
>>> use these generated freepage for further allocation request of
>>> corresponding migratetype.
>>>
>>> Once unmovable allocation taints movable pageblock, it cannot easily
>>> recover. Instead of praying that it gets restored, making it unmovable
>>> pageblock as much as possible and using it further unmovable request
>>> would be more reasonable approach.
>>>
>>> Below is result of this idea.
>>>
>>> * After
>>> Number of blocks type (movable)
>>> DMA32: 208.2
>>>
>>> Number of mixed blocks (movable)
>>> DMA32: 55.8
>>>
>>> Result shows that non-mixed block increase by 59% in this case.
Interesting. I tested a patch prototype like this too (although the work
wasn't offloaded to a kthread, I wanted to see benefits first) and it
yielded no significant difference. But admittedly I was using
stress-highalloc for huge page sized allocations and a 4GB memory system...
So with these results it seems definitely worth pursuing, taking Mel's
comments into account. We should think about coordination with
khugepaged, which is another source of compaction. See my patchset from
yesterday "Outsourcing page fault THP allocations to khugepaged" (sorry
I didn't CC you). I think ideally this "antifrag" or maybe "kcompactd"
thread would be one per NUMA node and serve both for the pageblock
antifragmentation requests (with higher priority) and then THP
allocation requests. Then khugepaged would do just the scanning for
collapses, which might be later moved to task_work context and
khugepaged killed. We could also remove compaction from kswapd balancing
and let it wake up kcompactd instead.
On Tue, May 12, 2015 at 09:51:56AM +0200, Vlastimil Babka wrote:
> On 04/27/2015 09:23 AM, Joonsoo Kim wrote:
> >When we steal whole pageblock, we don't need to break highest order
> >freepage. Perhaps, there is small order freepage so we can use it.
> >
> >This also gives us some code size reduction because expand() which
> >is used in __rmqueue_fallback() and inlined into __rmqueue_fallback()
> >is removed.
> >
> > text data bss dec hex filename
> > 37413 1440 624 39477 9a35 mm/page_alloc.o
> > 37249 1440 624 39313 9991 mm/page_alloc.o
> >
> >Signed-off-by: Joonsoo Kim <[email protected]>
> >---
> > mm/page_alloc.c | 40 +++++++++++++++++++++-------------------
> > 1 file changed, 21 insertions(+), 19 deletions(-)
> >
> >diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >index ed0f1c6..044f16c 100644
> >--- a/mm/page_alloc.c
> >+++ b/mm/page_alloc.c
> >@@ -1239,14 +1239,14 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
> > }
> >
> > /* Remove an element from the buddy allocator from the fallback list */
>
> This is no longer accurate description.
Okay. Will fix.
>
> >-static inline struct page *
> >+static inline bool
> > __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
> > {
> > struct free_area *area;
> > unsigned int current_order;
> > struct page *page;
> > int fallback_mt;
> >- bool can_steal;
> >+ bool can_steal_pageblock;
> >
> > /* Find the largest possible block of pages in the other list */
> > for (current_order = MAX_ORDER-1;
> >@@ -1254,26 +1254,24 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
> > --current_order) {
> > area = &(zone->free_area[current_order]);
> > fallback_mt = find_suitable_fallback(area, current_order,
> >- start_migratetype, false, &can_steal);
> >+ start_migratetype, false,
> >+ &can_steal_pageblock);
> > if (fallback_mt == -1)
> > continue;
> >
> > page = list_entry(area->free_list[fallback_mt].next,
> > struct page, lru);
>
> >- if (can_steal)
> >+ BUG_ON(!page);
>
> Please no new BUG_ON. VM_BUG_ON maybe for debugging, otherwise just
> let it panic on null pointer exception accessing page->lru later on.
Okay. I will remove it.
> >+
> >+ if (can_steal_pageblock)
> > steal_suitable_fallback(zone, page, start_migratetype);
> >
> >- /* Remove the page from the freelists */
> >- area->nr_free--;
> >- list_del(&page->lru);
> >- rmv_page_order(page);
> >+ list_move(&page->lru, &area->free_list[start_migratetype]);
>
> This list_move is redundant if we are stealing whole pageblock,
> right? Just put it in an else of the if above, and explain in
> comment?
I tried to put list_move() in an else branch of the if above and got a
panic due to a failure to steal the whole pageblock. At that time I didn't
want to dive deeply into the problem, so I used the simple solution above.
But now I have found the reason why stealing sometimes fails, so I can fix
it. I will do so in the next spin.
>
> >- expand(zone, page, order, current_order, area,
> >- start_migratetype);
> > /*
> > * The freepage_migratetype may differ from pageblock's
> > * migratetype depending on the decisions in
> >- * try_to_steal_freepages(). This is OK as long as it
> >+ * find_suitable_fallback(). This is OK as long as it
> > * does not differ for MIGRATE_CMA pageblocks. For CMA
> > * we need to make sure unallocated pages flushed from
> > * pcp lists are returned to the correct freelist.
>
> The whole thing with set_freepage_migratetype(page,
> start_migratetype); below this comment is now redundant, as
> rmqueue_smallest will do it too.
> The comment itself became outdated and misplaced too. I guess
> MIGRATE_CMA is now handled just by the fact that is is not set as
> fallback in the fallbacks array?
Okay. The whole comment seems to be redundant. I will remove it.
Thanks.
On Tue, May 12, 2015 at 09:54:51AM +0200, Vlastimil Babka wrote:
> On 05/12/2015 09:51 AM, Vlastimil Babka wrote:
> >> {
> >> struct page *page;
> >>+ bool steal_fallback;
> >>
> >>-retry_reserve:
> >>+retry:
> >> page = __rmqueue_smallest(zone, order, migratetype);
> >>
> >> if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
> >> if (migratetype == MIGRATE_MOVABLE)
> >> page = __rmqueue_cma_fallback(zone, order);
> >>
> >>- if (!page)
> >>- page = __rmqueue_fallback(zone, order, migratetype);
> >>+ if (page)
> >>+ goto out;
> >>+
> >>+ steal_fallback = __rmqueue_fallback(zone, order, migratetype);
>
> Oh and the variable can be probably replaced by calling
> __rmqueue_fallback directly in the if() below.
Will do.
Thanks.
On Tue, May 12, 2015 at 10:36:40AM +0200, Vlastimil Babka wrote:
> On 04/27/2015 09:23 AM, Joonsoo Kim wrote:
> >Sometimes we try to get more freepages from buddy list than how much
> >we really need, in order to refill pcp list. This may speed up following
> >allocation request, but, there is a possibility to increase fragmentation
> >if we steal freepages from other migratetype buddy list excessively. This
> >patch changes this behaviour to stop fallback allocation in order to
> >reduce fragmentation if we already get some freepages.
> >
> >CPU: 8
> >RAM: 512 MB with zram swap
> >WORKLOAD: kernel build with -j12
> >OPTION: page owner is enabled to measure fragmentation
> >After finishing the build, check fragmentation by 'cat /proc/pagetypeinfo'
> >
> >* Before
> >Number of blocks type (movable)
> >DMA32: 208.4
> >
> >Number of mixed blocks (movable)
> >DMA32: 139
> >
> >Mixed blocks means that there is one or more allocated page for
> >unmovable/reclaimable allocation in movable pageblock. Results shows that
> >more than half of movable pageblock is tainted by other migratetype
> >allocation.
> >
> >* After
> >Number of blocks type (movable)
> >DMA32: 207
> >
> >Number of mixed blocks (movable)
> >DMA32: 111.2
> >
> >This result shows that non-mixed block increase by 38% in this case.
> >
> >Signed-off-by: Joonsoo Kim <[email protected]>
>
> I agree that keeping fragmentation low is more important than
> filling up the pcplists. Wouldn't expect such large difference
> though. Are the results stable?
Yes, I think the improvement is very stable. The stdev is roughly 8 across
5 runs.
Thanks.
On Tue, May 12, 2015 at 11:01:48AM +0200, Vlastimil Babka wrote:
> On 04/28/2015 09:45 AM, Joonsoo Kim wrote:
> >On Mon, Apr 27, 2015 at 09:29:23AM +0100, Mel Gorman wrote:
> >>On Mon, Apr 27, 2015 at 04:23:41PM +0900, Joonsoo Kim wrote:
> >>>We already have antifragmentation policy in page allocator. It works well
> >>>when system memory is sufficient, but, it doesn't works well when system
> >>>memory isn't sufficient because memory is already highly fragmented and
> >>>fallback/steal mechanism cannot get whole pageblock. If there is severe
> >>>unmovable allocation requestor like zram, problem could get worse.
> >>>
> >>>CPU: 8
> >>>RAM: 512 MB with zram swap
> >>>WORKLOAD: kernel build with -j12
> >>>OPTION: page owner is enabled to measure fragmentation
> >>>After finishing the build, check fragmentation by 'cat /proc/pagetypeinfo'
> >>>
> >>>* Before
> >>>Number of blocks type (movable)
> >>>DMA32: 207
> >>>
> >>>Number of mixed blocks (movable)
> >>>DMA32: 111.2
> >>>
> >>>Mixed blocks means that there is one or more allocated page for
> >>>unmovable/reclaimable allocation in movable pageblock. Results shows that
> >>>more than half of movable pageblock is tainted by other migratetype
> >>>allocation.
> >>>
> >>>To mitigate this fragmentation, this patch implements active
> >>>anti-fragmentation algorithm. Idea is really simple. When some
> >>>unmovable/reclaimable steal happens from movable pageblock, we try to
> >>>migrate out other pages that can be migratable in this pageblock are and
> >>>use these generated freepage for further allocation request of
> >>>corresponding migratetype.
> >>>
> >>>Once unmovable allocation taints movable pageblock, it cannot easily
> >>>recover. Instead of praying that it gets restored, making it unmovable
> >>>pageblock as much as possible and using it further unmovable request
> >>>would be more reasonable approach.
> >>>
> >>>Below is result of this idea.
> >>>
> >>>* After
> >>>Number of blocks type (movable)
> >>>DMA32: 208.2
> >>>
> >>>Number of mixed blocks (movable)
> >>>DMA32: 55.8
> >>>
> >>>Result shows that non-mixed block increase by 59% in this case.
>
> Interesting. I tested a patch prototype like this too (although the
> work wasn't offloaded to a kthread, I wanted to see benefits first)
> and it yielded no significant difference. But admittedly I was using
> stress-highalloc for huge page sized allocations and a 4GB memory
> system...
Okay.
>
> So with these results it seems definitely worth pursuing, taking
> Mel's comments into account. We should think about coordination with
> khugepaged, which is another source of compaction. See my patchset
> from yesterday "Outsourcing page fault THP allocations to
> khugepaged" (sorry I didn't CC you). I think ideally this "antifrag"
I will check it.
> or maybe "kcompactd" thread would be one per NUMA node and serve
> both for the pageblock antifragmentation requests (with higher
Earlier, I tried an approach that creates one kantifragd thread per node.
Sometimes anti-fragmentation requests pile up on that thread faster than
it can handle them in time. By using a workqueue, I can spread the work
across all CPUs, so this problem is reduced. But it is really a policy
question of how much time we spend on anti-fragmentation work, so one
thread per node could also be enough.
Thanks.