In the situation where direct reclaim is required to make progress for
compaction, but no_progress_loops is already over the limit of
MAX_RECLAIM_RETRIES, consider invoking the oom killer.
Signed-off-by: Aaron Tomlin <[email protected]>
---
mm/page_alloc.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7a2c89b21115..8d748b1b8d1e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4181,6 +4181,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
return NULL;
}
+static inline bool
+should_try_oom(int no_progress_loops,
+ enum compact_result last_compact_result)
+{
+ if (no_progress_loops > MAX_RECLAIM_RETRIES && last_compact_result
+ == COMPACT_SKIPPED)
+ return true;
+ return false;
+}
+
static inline bool
should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_result compact_result,
@@ -4547,10 +4557,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
- if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
- /* Before OOM, exhaust highatomic_reserve */
- return unreserve_highatomic_pageblock(ac, true);
- }
+ if (*no_progress_loops > MAX_RECLAIM_RETRIES)
+ result false;
+ /* Last chance before OOM, try draining highatomic_reserve once */
+ else if (*no_progress_loops == MAX_RECLAIM_RETRIES)
+ return unreserve_highatomic_pageblock(ac, true)
/*
* Keep reclaiming pages while there is a chance this will lead
@@ -4822,6 +4833,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
did_some_progress > 0, &no_progress_loops))
goto retry;
+ if (should_try_oom(no_progress_loops, compact_result))
+ goto oom:
/*
* It doesn't make any sense to retry for the compaction if the order-0
* reclaim is not able to make any progress because the current
@@ -4839,6 +4852,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
+oom:
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
--
2.26.2
Hi Aaron,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on hnaz-linux-mm/master]
url: https://github.com/0day-ci/linux/commits/Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
base: https://github.com/hnaz/linux-mm master
config: arc-randconfig-r024-20210315 (attached as .config)
compiler: arceb-elf-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/77338aaff2606a7715c832545e79370e849e3b4e
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
git checkout 77338aaff2606a7715c832545e79370e849e3b4e
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=arc
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All error/warnings (new ones prefixed by >>):
mm/page_alloc.c: In function 'should_reclaim_retry':
>> mm/page_alloc.c:4444:3: error: 'result' undeclared (first use in this function)
4444 | result false;
| ^~~~~~
mm/page_alloc.c:4444:3: note: each undeclared identifier is reported only once for each function it appears in
>> mm/page_alloc.c:4444:9: error: expected ';' before 'false'
4444 | result false;
| ^~~~~~
| ;
>> mm/page_alloc.c:4447:50: error: expected ';' before 'for'
4447 | return unreserve_highatomic_pageblock(ac, true)
| ^
| ;
mm/page_alloc.c:4426:18: warning: unused variable 'z' [-Wunused-variable]
4426 | struct zoneref *z;
| ^
mm/page_alloc.c:4425:15: warning: unused variable 'zone' [-Wunused-variable]
4425 | struct zone *zone;
| ^~~~
mm/page_alloc.c: In function '__alloc_pages_slowpath':
>> mm/page_alloc.c:4720:11: error: expected ';' before ':' token
4720 | goto oom:
| ^
| ;
>> mm/page_alloc.c:4556:6: warning: variable 'compaction_retries' set but not used [-Wunused-but-set-variable]
4556 | int compaction_retries;
| ^~~~~~~~~~~~~~~~~~
mm/page_alloc.c: At top level:
mm/page_alloc.c:6136:23: warning: no previous prototype for 'memmap_init' [-Wmissing-prototypes]
6136 | void __meminit __weak memmap_init(unsigned long size, int nid,
| ^~~~~~~~~~~
vim +/result +4444 mm/page_alloc.c
4409
4410 /*
4411 * Checks whether it makes sense to retry the reclaim to make a forward progress
4412 * for the given allocation request.
4413 *
4414 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
4415 * without success, or when we couldn't even meet the watermark if we
4416 * reclaimed all remaining pages on the LRU lists.
4417 *
4418 * Returns true if a retry is viable or false to enter the oom path.
4419 */
4420 static inline bool
4421 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
4422 struct alloc_context *ac, int alloc_flags,
4423 bool did_some_progress, int *no_progress_loops)
4424 {
4425 struct zone *zone;
4426 struct zoneref *z;
4427 bool ret = false;
4428
4429 /*
4430 * Costly allocations might have made a progress but this doesn't mean
4431 * their order will become available due to high fragmentation so
4432 * always increment the no progress counter for them
4433 */
4434 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
4435 *no_progress_loops = 0;
4436 else
4437 (*no_progress_loops)++;
4438
4439 /*
4440 * Make sure we converge to OOM if we cannot make any progress
4441 * several times in the row.
4442 */
4443 if (*no_progress_loops > MAX_RECLAIM_RETRIES)
> 4444 result false;
4445 /* Last chance before OOM, try draining highatomic_reserve once */
4446 else if (*no_progress_loops == MAX_RECLAIM_RETRIES)
> 4447 return unreserve_highatomic_pageblock(ac, true)
4448
4449 /*
4450 * Keep reclaiming pages while there is a chance this will lead
4451 * somewhere. If none of the target zones can satisfy our allocation
4452 * request even if all reclaimable pages are considered then we are
4453 * screwed and have to go OOM.
4454 */
4455 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4456 ac->highest_zoneidx, ac->nodemask) {
4457 unsigned long available;
4458 unsigned long reclaimable;
4459 unsigned long min_wmark = min_wmark_pages(zone);
4460 bool wmark;
4461
4462 available = reclaimable = zone_reclaimable_pages(zone);
4463 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
4464
4465 /*
4466 * Would the allocation succeed if we reclaimed all
4467 * reclaimable pages?
4468 */
4469 wmark = __zone_watermark_ok(zone, order, min_wmark,
4470 ac->highest_zoneidx, alloc_flags, available);
4471 trace_reclaim_retry_zone(z, order, reclaimable,
4472 available, min_wmark, *no_progress_loops, wmark);
4473 if (wmark) {
4474 /*
4475 * If we didn't make any progress and have a lot of
4476 * dirty + writeback pages then we should wait for
4477 * an IO to complete to slow down the reclaim and
4478 * prevent from pre mature OOM
4479 */
4480 if (!did_some_progress) {
4481 unsigned long write_pending;
4482
4483 write_pending = zone_page_state_snapshot(zone,
4484 NR_ZONE_WRITE_PENDING);
4485
4486 if (2 * write_pending > reclaimable) {
4487 congestion_wait(BLK_RW_ASYNC, HZ/10);
4488 return true;
4489 }
4490 }
4491
4492 ret = true;
4493 goto out;
4494 }
4495 }
4496
4497 out:
4498 /*
4499 * Memory allocation/reclaim might be called from a WQ context and the
4500 * current implementation of the WQ concurrency control doesn't
4501 * recognize that a particular WQ is congested if the worker thread is
4502 * looping without ever sleeping. Therefore we have to do a short sleep
4503 * here rather than calling cond_resched().
4504 */
4505 if (current->flags & PF_WQ_WORKER)
4506 schedule_timeout_uninterruptible(1);
4507 else
4508 cond_resched();
4509 return ret;
4510 }
4511
4512 static inline bool
4513 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
4514 {
4515 /*
4516 * It's possible that cpuset's mems_allowed and the nodemask from
4517 * mempolicy don't intersect. This should be normally dealt with by
4518 * policy_nodemask(), but it's possible to race with cpuset update in
4519 * such a way the check therein was true, and then it became false
4520 * before we got our cpuset_mems_cookie here.
4521 * This assumes that for all allocations, ac->nodemask can come only
4522 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
4523 * when it does not intersect with the cpuset restrictions) or the
4524 * caller can deal with a violated nodemask.
4525 */
4526 if (cpusets_enabled() && ac->nodemask &&
4527 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
4528 ac->nodemask = NULL;
4529 return true;
4530 }
4531
4532 /*
4533 * When updating a task's mems_allowed or mempolicy nodemask, it is
4534 * possible to race with parallel threads in such a way that our
4535 * allocation can fail while the mask is being updated. If we are about
4536 * to fail, check if the cpuset changed during allocation and if so,
4537 * retry.
4538 */
4539 if (read_mems_allowed_retry(cpuset_mems_cookie))
4540 return true;
4541
4542 return false;
4543 }
4544
4545 static inline struct page *
4546 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
4547 struct alloc_context *ac)
4548 {
4549 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
4550 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
4551 struct page *page = NULL;
4552 unsigned int alloc_flags;
4553 unsigned long did_some_progress;
4554 enum compact_priority compact_priority;
4555 enum compact_result compact_result;
> 4556 int compaction_retries;
4557 int no_progress_loops;
4558 unsigned int cpuset_mems_cookie;
4559 int reserve_flags;
4560
4561 /*
4562 * We also sanity check to catch abuse of atomic reserves being used by
4563 * callers that are not in atomic context.
4564 */
4565 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
4566 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
4567 gfp_mask &= ~__GFP_ATOMIC;
4568
4569 retry_cpuset:
4570 compaction_retries = 0;
4571 no_progress_loops = 0;
4572 compact_priority = DEF_COMPACT_PRIORITY;
4573 cpuset_mems_cookie = read_mems_allowed_begin();
4574
4575 /*
4576 * The fast path uses conservative alloc_flags to succeed only until
4577 * kswapd needs to be woken up, and to avoid the cost of setting up
4578 * alloc_flags precisely. So we do that now.
4579 */
4580 alloc_flags = gfp_to_alloc_flags(gfp_mask);
4581
4582 /*
4583 * We need to recalculate the starting point for the zonelist iterator
4584 * because we might have used different nodemask in the fast path, or
4585 * there was a cpuset modification and we are retrying - otherwise we
4586 * could end up iterating over non-eligible zones endlessly.
4587 */
4588 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4589 ac->highest_zoneidx, ac->nodemask);
4590 if (!ac->preferred_zoneref->zone)
4591 goto nopage;
4592
4593 if (alloc_flags & ALLOC_KSWAPD)
4594 wake_all_kswapds(order, gfp_mask, ac);
4595
4596 /*
4597 * The adjusted alloc_flags might result in immediate success, so try
4598 * that first
4599 */
4600 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4601 if (page)
4602 goto got_pg;
4603
4604 /*
4605 * For costly allocations, try direct compaction first, as it's likely
4606 * that we have enough base pages and don't need to reclaim. For non-
4607 * movable high-order allocations, do that as well, as compaction will
4608 * try prevent permanent fragmentation by migrating from blocks of the
4609 * same migratetype.
4610 * Don't try this for allocations that are allowed to ignore
4611 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
4612 */
4613 if (can_direct_reclaim &&
4614 (costly_order ||
4615 (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
4616 && !gfp_pfmemalloc_allowed(gfp_mask)) {
4617 page = __alloc_pages_direct_compact(gfp_mask, order,
4618 alloc_flags, ac,
4619 INIT_COMPACT_PRIORITY,
4620 &compact_result);
4621 if (page)
4622 goto got_pg;
4623
4624 /*
4625 * Checks for costly allocations with __GFP_NORETRY, which
4626 * includes some THP page fault allocations
4627 */
4628 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
4629 /*
4630 * If allocating entire pageblock(s) and compaction
4631 * failed because all zones are below low watermarks
4632 * or is prohibited because it recently failed at this
4633 * order, fail immediately unless the allocator has
4634 * requested compaction and reclaim retry.
4635 *
4636 * Reclaim is
4637 * - potentially very expensive because zones are far
4638 * below their low watermarks or this is part of very
4639 * bursty high order allocations,
4640 * - not guaranteed to help because isolate_freepages()
4641 * may not iterate over freed pages as part of its
4642 * linear scan, and
4643 * - unlikely to make entire pageblocks free on its
4644 * own.
4645 */
4646 if (compact_result == COMPACT_SKIPPED ||
4647 compact_result == COMPACT_DEFERRED)
4648 goto nopage;
4649
4650 /*
4651 * Looks like reclaim/compaction is worth trying, but
4652 * sync compaction could be very expensive, so keep
4653 * using async compaction.
4654 */
4655 compact_priority = INIT_COMPACT_PRIORITY;
4656 }
4657 }
4658
4659 retry:
4660 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4661 if (alloc_flags & ALLOC_KSWAPD)
4662 wake_all_kswapds(order, gfp_mask, ac);
4663
4664 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
4665 if (reserve_flags)
4666 alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
4667
4668 /*
4669 * Reset the nodemask and zonelist iterators if memory policies can be
4670 * ignored. These allocations are high priority and system rather than
4671 * user oriented.
4672 */
4673 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
4674 ac->nodemask = NULL;
4675 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4676 ac->highest_zoneidx, ac->nodemask);
4677 }
4678
4679 /* Attempt with potentially adjusted zonelist and alloc_flags */
4680 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4681 if (page)
4682 goto got_pg;
4683
4684 /* Caller is not willing to reclaim, we can't balance anything */
4685 if (!can_direct_reclaim)
4686 goto nopage;
4687
4688 /* Avoid recursion of direct reclaim */
4689 if (current->flags & PF_MEMALLOC)
4690 goto nopage;
4691
4692 /* Try direct reclaim and then allocating */
4693 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4694 &did_some_progress);
4695 if (page)
4696 goto got_pg;
4697
4698 /* Try direct compaction and then allocating */
4699 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4700 compact_priority, &compact_result);
4701 if (page)
4702 goto got_pg;
4703
4704 /* Do not loop if specifically requested */
4705 if (gfp_mask & __GFP_NORETRY)
4706 goto nopage;
4707
4708 /*
4709 * Do not retry costly high order allocations unless they are
4710 * __GFP_RETRY_MAYFAIL
4711 */
4712 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
4713 goto nopage;
4714
4715 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4716 did_some_progress > 0, &no_progress_loops))
4717 goto retry;
4718
4719 if (should_try_oom(no_progress_loops, compact_result))
4720 goto oom:
4721 /*
4722 * It doesn't make any sense to retry for the compaction if the order-0
4723 * reclaim is not able to make any progress because the current
4724 * implementation of the compaction depends on the sufficient amount
4725 * of free memory (see __compaction_suitable)
4726 */
4727 if (did_some_progress > 0 &&
4728 should_compact_retry(ac, order, alloc_flags,
4729 compact_result, &compact_priority,
4730 &compaction_retries))
4731 goto retry;
4732
4733
4734 /* Deal with possible cpuset update races before we start OOM killing */
4735 if (check_retry_cpuset(cpuset_mems_cookie, ac))
4736 goto retry_cpuset;
4737
4738 oom:
4739 /* Reclaim has failed us, start killing things */
4740 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4741 if (page)
4742 goto got_pg;
4743
4744 /* Avoid allocations with no watermarks from looping endlessly */
4745 if (tsk_is_oom_victim(current) &&
4746 (alloc_flags & ALLOC_OOM ||
4747 (gfp_mask & __GFP_NOMEMALLOC)))
4748 goto nopage;
4749
4750 /* Retry as long as the OOM killer is making progress */
4751 if (did_some_progress) {
4752 no_progress_loops = 0;
4753 goto retry;
4754 }
4755
4756 nopage:
4757 /* Deal with possible cpuset update races before we fail */
4758 if (check_retry_cpuset(cpuset_mems_cookie, ac))
4759 goto retry_cpuset;
4760
4761 /*
4762 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4763 * we always retry
4764 */
4765 if (gfp_mask & __GFP_NOFAIL) {
4766 /*
4767 * All existing users of the __GFP_NOFAIL are blockable, so warn
4768 * of any new users that actually require GFP_NOWAIT
4769 */
4770 if (WARN_ON_ONCE(!can_direct_reclaim))
4771 goto fail;
4772
4773 /*
4774 * PF_MEMALLOC request from this context is rather bizarre
4775 * because we cannot reclaim anything and only can loop waiting
4776 * for somebody to do a work for us
4777 */
4778 WARN_ON_ONCE(current->flags & PF_MEMALLOC);
4779
4780 /*
4781 * non failing costly orders are a hard requirement which we
4782 * are not prepared for much so let's warn about these users
4783 * so that we can identify them and convert them to something
4784 * else.
4785 */
4786 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
4787
4788 /*
4789 * Help non-failing allocations by giving them access to memory
4790 * reserves but do not use ALLOC_NO_WATERMARKS because this
4791 * could deplete whole memory reserves which would just make
4792 * the situation worse
4793 */
4794 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
4795 if (page)
4796 goto got_pg;
4797
4798 cond_resched();
4799 goto retry;
4800 }
4801 fail:
4802 warn_alloc(gfp_mask, ac->nodemask,
4803 "page allocation failure: order:%u", order);
4804 got_pg:
4805 return page;
4806 }
4807
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
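For reference, the errors above suggest the broken hunks were meant to read
roughly as follows; this is only a sketch of the apparent intent, and the
actual fix is the author's call. In should_reclaim_retry():

	/*
	 * Make sure we converge to OOM if we cannot make any progress
	 * several times in the row.
	 */
	if (*no_progress_loops > MAX_RECLAIM_RETRIES)
		return false;
	/* Last chance before OOM, try draining highatomic_reserve once */
	else if (*no_progress_loops == MAX_RECLAIM_RETRIES)
		return unreserve_highatomic_pageblock(ac, true);

and in __alloc_pages_slowpath():

	if (should_try_oom(no_progress_loops, compact_result))
		goto oom;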
Hi Aaron,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on hnaz-linux-mm/master]
url: https://github.com/0day-ci/linux/commits/Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
base: https://github.com/hnaz/linux-mm master
config: riscv-randconfig-r026-20210315 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project a28facba1ccdc957f386b7753f4958307f1bfde8)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install riscv cross compiling tool for clang build
# apt-get install binutils-riscv64-linux-gnu
# https://github.com/0day-ci/linux/commit/77338aaff2606a7715c832545e79370e849e3b4e
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
git checkout 77338aaff2606a7715c832545e79370e849e3b4e
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=riscv
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
mm/page_alloc.c:2538:5: warning: no previous prototype for function 'find_suitable_fallback' [-Wmissing-prototypes]
int find_suitable_fallback(struct free_area *area, unsigned int order,
^
mm/page_alloc.c:2538:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
int find_suitable_fallback(struct free_area *area, unsigned int order,
^
static
mm/page_alloc.c:4444:3: error: use of undeclared identifier 'result'
result false;
^
mm/page_alloc.c:4447:50: error: expected ';' after return statement
return unreserve_highatomic_pageblock(ac, true)
^
;
mm/page_alloc.c:4507:2: error: expected expression
else
^
>> mm/page_alloc.c:4719:6: error: implicit declaration of function 'should_try_oom' [-Werror,-Wimplicit-function-declaration]
if (should_try_oom(no_progress_loops, compact_result))
^
mm/page_alloc.c:4720:11: error: expected ';' after goto statement
goto oom:
^
;
mm/page_alloc.c:6136:23: warning: no previous prototype for function 'memmap_init' [-Wmissing-prototypes]
void __meminit __weak memmap_init(unsigned long size, int nid,
^
mm/page_alloc.c:6136:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void __meminit __weak memmap_init(unsigned long size, int nid,
^
static
9 warnings and 5 errors generated.
vim +/should_try_oom +4719 mm/page_alloc.c
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
Hi Aaron,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on hnaz-linux-mm/master]
url: https://github.com/0day-ci/linux/commits/Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
base: https://github.com/hnaz/linux-mm master
config: powerpc64-randconfig-r012-20210315 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project a28facba1ccdc957f386b7753f4958307f1bfde8)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install powerpc64 cross compiling tool for clang build
# apt-get install binutils-powerpc64-linux-gnu
# https://github.com/0day-ci/linux/commit/77338aaff2606a7715c832545e79370e849e3b4e
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
git checkout 77338aaff2606a7715c832545e79370e849e3b4e
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc64
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
mm/page_alloc.c:2538:5: warning: no previous prototype for function 'find_suitable_fallback' [-Wmissing-prototypes]
int find_suitable_fallback(struct free_area *area, unsigned int order,
^
mm/page_alloc.c:2538:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
int find_suitable_fallback(struct free_area *area, unsigned int order,
^
static
>> mm/page_alloc.c:4444:3: error: use of undeclared identifier 'result'
result false;
^
>> mm/page_alloc.c:4447:50: error: expected ';' after return statement
return unreserve_highatomic_pageblock(ac, true)
^
;
>> mm/page_alloc.c:4507:2: error: expected expression
else
^
>> mm/page_alloc.c:4719:6: error: implicit declaration of function 'should_try_oom' [-Werror,-Wimplicit-function-declaration]
if (should_try_oom(no_progress_loops, compact_result))
^
>> mm/page_alloc.c:4720:11: error: expected ';' after goto statement
goto oom:
^
;
mm/page_alloc.c:6136:23: warning: no previous prototype for function 'memmap_init' [-Wmissing-prototypes]
void __meminit __weak memmap_init(unsigned long size, int nid,
^
mm/page_alloc.c:6136:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void __meminit __weak memmap_init(unsigned long size, int nid,
^
static
2 warnings and 5 errors generated.
vim +/result +4444 mm/page_alloc.c
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
On Mon 15-03-21 16:58:37, Aaron Tomlin wrote:
> In the situation where direct reclaim is required to make progress for
> compaction but no_progress_loops is already over the limit of
> MAX_RECLAIM_RETRIES consider invoking the oom killer.
What is the problem you are trying to fix?
>
> Signed-off-by: Aaron Tomlin <[email protected]>
> ---
> mm/page_alloc.c | 22 ++++++++++++++++++----
> 1 file changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7a2c89b21115..8d748b1b8d1e 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4181,6 +4181,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> return NULL;
> }
>
> +static inline bool
> +should_try_oom(int no_progress_loops,
> + enum compact_result last_compact_result)
> +{
> + if (no_progress_loops > MAX_RECLAIM_RETRIES && last_compact_result
> + == COMPACT_SKIPPED)
> + return true;
> + return false;
> +}
> +
> static inline bool
> should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
> enum compact_result compact_result,
> @@ -4547,10 +4557,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
> * Make sure we converge to OOM if we cannot make any progress
> * several times in the row.
> */
> - if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
> - /* Before OOM, exhaust highatomic_reserve */
> - return unreserve_highatomic_pageblock(ac, true);
> - }
> + if (*no_progress_loops > MAX_RECLAIM_RETRIES)
> + result false;
> + /* Last chance before OOM, try draining highatomic_reserve once */
> + else if (*no_progress_loops == MAX_RECLAIM_RETRIES)
> + return unreserve_highatomic_pageblock(ac, true)
>
> /*
> * Keep reclaiming pages while there is a chance this will lead
> @@ -4822,6 +4833,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> did_some_progress > 0, &no_progress_loops))
> goto retry;
>
> + if (should_try_oom(no_progress_loops, compact_result))
> + goto oom:
> /*
> * It doesn't make any sense to retry for the compaction if the order-0
> * reclaim is not able to make any progress because the current
> @@ -4839,6 +4852,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> if (check_retry_cpuset(cpuset_mems_cookie, ac))
> goto retry_cpuset;
>
> +oom:
> /* Reclaim has failed us, start killing things */
> page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
> if (page)
> --
> 2.26.2
>
--
Michal Hocko
SUSE Labs
Hi Michal,
On Thu 2021-03-18 17:16 +0100, Michal Hocko wrote:
> On Mon 15-03-21 16:58:37, Aaron Tomlin wrote:
> > In the situation where direct reclaim is required to make progress for
> > compaction but no_progress_loops is already over the limit of
> > MAX_RECLAIM_RETRIES consider invoking the oom killer.
Firstly, thank you for your response.
> What is the problem you are trying to fix?
If I understand correctly, in the case of a "costly" order allocation
request that is permitted to repeatedly retry, it is possible to exceed the
maximum reclaim retry threshold as long as "some" progress is being made,
even at the highest compaction priority. Furthermore, if the allocator has
a fatal signal pending, this is not considered.
In my opinion, it might be better to just give up straight away, or to try and
use the OOM killer only in the non-costly order allocation scenario to
assist reclaim. Looking at __alloc_pages_may_oom(), the current logic is to
entirely skip the OOM killer for a costly order request, which makes sense.
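For reference, the check in question reads roughly as follows in that area of
mm/page_alloc.c (paraphrased):

	/* The OOM killer will not help higher-order allocs */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		goto out;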
Regards,
--
Aaron Tomlin
On Fri 19-03-21 17:29:01, Aaron Tomlin wrote:
> Hi Michal,
>
> On Thu 2021-03-18 17:16 +0100, Michal Hocko wrote:
> > On Mon 15-03-21 16:58:37, Aaron Tomlin wrote:
> > > In the situation where direct reclaim is required to make progress for
> > > compaction but no_progress_loops is already over the limit of
> > > MAX_RECLAIM_RETRIES consider invoking the oom killer.
>
> Firstly, thank you for your response.
>
> > What is the problem you are trying to fix?
>
> If I understand correctly, in the case of a "costly" order allocation
> request that is permitted to repeatedly retry, it is possible to exceed the
> maximum reclaim retry threshold as long as "some" progress is being made
> even at the highest compaction priority.
Costly orders already do have heuristics for the retry in place. Could
you be more specific what kind of problem you see with those?
> Furthermore, if the allocator has a fatal signal pending, this is not
> considered.
Fatal signals pending are usually not a strong reason to cut the retry
count or to fail allocations.
> In my opinion, it might be better to just give up straight away or try and
> use the OOM killer only in the non-costly order allocation scenario to
> assit reclaim. Looking at __alloc_pages_may_oom() the current logic is to
> entirely skip the OOM killer for a costly order request, which makes sense.
Well, opinions might differ of course. The main question is whether
there are workloads which are unhappy about the existing behavior.
--
Michal Hocko
SUSE Labs
On Mon 2021-03-22 11:47 +0100, Michal Hocko wrote:
> Costly orders already do have heuristics for the retry in place. Could
> you be more specific what kind of problem you see with those?
If I understand correctly, when the gfp_mask consists of
GFP_KERNEL | __GFP_RETRY_MAYFAIL in particular, an allocation request will
fail, if and only if reclaim is unable to make progress.
The costly order allocation retry logic is handled primarily in
should_reclaim_retry(). Looking at should_reclaim_retry() we see that the
no progress counter value is always incremented in the costly order case
even when "some" progress has been made which is represented by the value
stored in did_some_progress.
if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
I think after we have tried MAX_RECLAIM_RETRIES in a row without success
and the last known compaction attempt was "skipped", perhaps it would be
better to try and use the OOM killer or fail the allocation attempt?
I encountered a situation when the value of no_progress_loops was found to
be 31,611,688 i.e. significantly over MAX_RECLAIM_RETRIES; the allocation
order was 5. The gfp_mask contained the following:
#define ___GFP_HIGHMEM 0x02
#define ___GFP_IO 0x40
#define ___GFP_FS 0x80
#define ___GFP_NOWARN 0x200
#define ___GFP_RETRY_MAYFAIL 0x400
#define ___GFP_COMP 0x4000
#define ___GFP_HARDWALL 0x20000
#define ___GFP_DIRECT_RECLAIM 0x200000
#define ___GFP_KSWAPD_RECLAIM 0x400000
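For what it's worth, decoding those bits (my reading of the flag values listed
above, so treat it as an assumption rather than something taken from the dump):

	/*
	 * __GFP_IO | __GFP_FS | __GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM
	 * is GFP_KERNEL; adding __GFP_HARDWALL and __GFP_HIGHMEM gives
	 * GFP_HIGHUSER, so the request was effectively:
	 */
	gfp_t gfp_mask = GFP_HIGHUSER | __GFP_NOWARN | __GFP_RETRY_MAYFAIL |
			 __GFP_COMP;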
--
Aaron Tomlin
[Cc Vlastimil]
On Thu 25-03-21 21:01:59, Aaron Tomlin wrote:
> On Mon 2021-03-22 11:47 +0100, Michal Hocko wrote:
> > Costly orders already do have heuristics for the retry in place. Could
> > you be more specific what kind of problem you see with those?
>
> If I understand correctly, when the gfp_mask consists of
> GFP_KERNEL | __GFP_RETRY_MAYFAIL in particular, an allocation request will
> fail, if and only if reclaim is unable to make progress.
>
> The costly order allocation retry logic is handled primarily in
> should_reclaim_retry(). Looking at should_reclaim_retry() we see that the
> no progress counter value is always incremented in the costly order case
> even when "some" progress has been made which is represented by the value
> stored in did_some_progress.
>
> if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
> goto nopage;
>
> if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
> did_some_progress > 0, &no_progress_loops))
> goto retry;
>
> I think after we have tried MAX_RECLAIM_RETRIES in a row without success
> and the last known compaction attempt was "skipped", perhaps it would be
> better to try and use the OOM killer or fail the allocation attempt?
The oom killer is never triggered for costly allocation request. Both
reclaim and compaction maintain their own retries counters as they are
targeting a different operation. Although the compaction really depends
on the reclaim to do some progress.
> I encountered a situation when the value of no_progress_loops was found to
> be 31,611,688 i.e. significantly over MAX_RECLAIM_RETRIES; the allocation
> order was 5. The gfp_mask contained the following:
OK, this sound unexpected as it indicates that the reclaim is able to
make a forward progress but compaction doesn't want to give up and keeps
retrying. Are you able to reproduce this or could you find out which
specific condition keeps compaction retrying? I would expect that it is
one of the 3 conditions before the max_retries is checked.
> #define ___GFP_HIGHMEM 0x02
> #define ___GFP_IO 0x40
> #define ___GFP_FS 0x80
> #define ___GFP_NOWARN 0x200
> #define ___GFP_RETRY_MAYFAIL 0x400
> #define ___GFP_COMP 0x4000
> #define ___GFP_HARDWALL 0x20000
> #define ___GFP_DIRECT_RECLAIM 0x200000
> #define ___GFP_KSWAPD_RECLAIM 0x400000
>
>
>
> --
> Aaron Tomlin
>
--
Michal Hocko
SUSE Labs
Hi Michal,
On Fri 2021-03-26 09:16 +0100, Michal Hocko wrote:
> The oom killer is never triggered for costly allocation request.
Yes - I agree. Looking at __alloc_pages_may_oom(), I can see that for a costly
order allocation request the OOM killer is explicitly not used.
If I understand correctly, the patch I proposed was intended for the following
scenarios:
1. A costly order allocation request should fail when only
"some" (i.e. order-0) progress is made and the last
compaction attempt was "skipped"
2. A non-costly order allocation request that is unable
to make any progress, and has exceeded the maximum
reclaim retry count in a row while the last known
compaction result was skipped, should consider using
the OOM killer for assistance
> Both reclaim and compaction maintain their own retries counters as they
> are targeting a different operation. Although the compaction really
> depends on the reclaim to do some progress.
Yes. Looking at should_compact_retry(), if the last known compaction result
was skipped, that suggests there were not enough order-0 pages to support
compaction, so assistance is needed from reclaim
(see __compaction_suitable()).
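Roughly speaking, the check in __compaction_suitable() that produces
COMPACT_SKIPPED is the order-0 watermark test (paraphrased; the exact
watermark and flags differ slightly between kernel versions):

	/*
	 * Paraphrased: compaction needs enough free base pages (the
	 * watermark plus compact_gap(order)) to create migration targets;
	 * if not, it reports COMPACT_SKIPPED and effectively defers to
	 * reclaim.
	 */
	watermark = min_wmark_pages(zone) + compact_gap(order);
	if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
				 ALLOC_CMA, wmark_target))
		return COMPACT_SKIPPED;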
I noticed that the value of compaction_retries, compact_result and
compact_priority was 0, COMPACT_SKIPPED and 1 i.e. COMPACT_PRIO_SYNC_LIGHT,
respectively.
> OK, this sound unexpected as it indicates that the reclaim is able to
> make a forward progress but compaction doesn't want to give up and keeps
> retrying. Are you able to reproduce this or could you find out which
> specific condition keeps compaction retrying? I would expect that it is
> one of the 3 conditions before the max_retries is checked.
Unfortunately, I have been told it is not entirely reproducible.
I suspect it is the following in should_compact_retry() - as I indicated
above the last known value stored in compaction_retries was 0:
if (order > PAGE_ALLOC_COSTLY_ORDER)
max_retries /= 4;
if (*compaction_retries <= max_retries) {
ret = true;
goto out;
}
Kind regards,
--
Aaron Tomlin
On Fri 26-03-21 11:22:54, Aaron Tomlin wrote:
[...]
> > Both reclaim and compaction maintain their own retries counters as they
> > are targeting a different operation. Although the compaction really
> > depends on the reclaim to do some progress.
>
> Yes. Looking at should_compact_retry() if the last known compaction result
> was skipped i.e. suggesting there was not enough order-0 pages to support
> compaction, so assistance is needed from reclaim
> (see __compaction_suitable()).
>
> I noticed that the value of compaction_retries, compact_result and
> compact_priority was 0, COMPACT_SKIPPED and 1 i.e. COMPACT_PRIO_SYNC_LIGHT,
> respectively.
>
> > OK, this sound unexpected as it indicates that the reclaim is able to
> > make a forward progress but compaction doesn't want to give up and keeps
> > retrying. Are you able to reproduce this or could you find out which
> > specific condition keeps compaction retrying? I would expect that it is
> > one of the 3 conditions before the max_retries is checked.
>
> Unfortunately, I have been told it is not entirely reproducible.
> I suspect it is the following in should_compact_retry() - as I indicated
> above the last known value stored in compaction_retries was 0:
>
>
> if (order > PAGE_ALLOC_COSTLY_ORDER)
> max_retries /= 4;
> if (*compaction_retries <= max_retries) {
> ret = true;
> goto out;
> }
OK, I kinda expected this would be not easily reproducible. The reason I
dislike your patch is that it adds yet another criterion for oom while
we already have two, which doesn't make the resulting code easier to
reason about. We should be focusing on the compaction retry logic and
see whether we can have some "run away" scenarios there. Seeing so many
retries without compaction bailing out sounds like a bug in that retry
logic. Vlastimil is much more familiar with that.
--
Michal Hocko
SUSE Labs
On Fri 2021-03-26 16:36 +0100, Michal Hocko wrote:
> We should be focusing on the compaction retry logic and see whether we
> can have some "run away" scenarios there.
Fair enough.
> Seeing so many retries without compaction bailing out sounds like a bug
> in that retry logic.
Agreed - I will continue to look into this.
Kind regards,
--
Aaron Tomlin
Michal,
On Fri 2021-03-26 16:36 +0100, Michal Hocko wrote:
> OK, I kinda expected this would be not easily reproducible.
Unfortunately, I'm still waiting for feedback on this.
> We should be focusing on the compaction retry logic and see whether we
> can have some "run away" scenarios there. Seeing so many retries without
> compaction bailing out sounds like a bug in that retry logic.
I suspect so.
This is indeed a case of excessive reclaim/compaction retries (i.e. the
last known value stored in the no_progress_loops variable was 31,611,688).
What might be particularly unique about this situation is that a fatal
signal was found pending. In this context, if I understand correctly, it
does not make sense to retry compaction when the last known compact result
was skipped and a fatal signal is pending.
Looking at try_to_compact_pages(), indeed COMPACT_SKIPPED can be returned;
albeit not every zone on the zone list would be considered in the case
a fatal signal is found to be pending. Yet, in should_compact_retry(),
given the last known compaction result, each zone on the zone list can be
considered/checked (see compaction_zonelist_suitable()). If a zone e.g.
was found to succeed, then reclaim/compaction would be tried again
(notwithstanding the above).
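For reference, the early bail-out in the try_to_compact_pages() zone loop is
roughly (paraphrased):

	/*
	 * Paraphrased from the end of the zone loop in
	 * try_to_compact_pages(): stop considering further zones once
	 * async compaction needs to reschedule or a fatal signal is
	 * pending, so the remaining zones never see a compaction attempt.
	 */
	if ((prio == COMPACT_PRIO_ASYNC && need_resched())
				|| fatal_signal_pending(current))
		break;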
--
Aaron Tomlin
On Tue 18-05-21 15:05:54, Aaron Tomlin wrote:
> Michal,
>
> On Fri 2021-03-26 16:36 +0100, Michal Hocko wrote:
> > OK, I kinda expected this would be not easily reproducible.
>
> Unfortunately, I'm still waiting for feedback on this.
>
> > We should be focusing on the compaction retry logic and see whether we
> > can have some "run away" scenarios there. Seeing so many retries without
> > compaction bailing out sounds like a bug in that retry logic.
>
> I suspect so.
>
> This is indeed a case of excessive reclaim/compaction retries (i.e. the
> last known value stored in the no_progress_loops variable was 31,611,688).
>
> What might be particularly unique about this situation is that a fatal
> signal was found pending. In this context, if I understand correctly, it
> does not make sense to retry compaction when the last known compact result
> was skipped and a fatal signal is pending.
OK, this might be an interesting lead.
> Looking at try_to_compact_pages(), indeed COMPACT_SKIPPED can be returned;
> albeit not every zone on the zone list would be considered in the case
> a fatal signal is found to be pending. Yet, in should_compact_retry(),
> given the last known compaction result, each zone on the zone list can be
> considered/checked (see compaction_zonelist_suitable()). If a zone e.g.
> was found to succeed, then reclaim/compaction would be tried again
> (notwithstanding the above).
I believe Vlastimil would be a much better fit for looking into those
details, but it smells like pending fatal signals can indeed lead to
unbounded retries.
Thanks!
--
Michal Hocko
SUSE Labs
Michal,
On Wed 2021-05-19 13:10 +0200, Michal Hocko wrote:
> > Looking at try_to_compact_pages(), indeed COMPACT_SKIPPED can be returned;
> > albeit, not every zone, on the zone list, would be considered in the case
> > a fatal signal is found to be pending. Yet, in should_compact_retry(),
> > given the last known compaction result, each zone, on the zone list, can be
> > considered/or checked (see compaction_zonelist_suitable()). If a zone e.g.
> > was found to succeed then reclaim/compaction would be tried again
> > (notwithstanding the above).
>
> I believe Vlastimil would be a much better fit for looking into those
> details, but it smells like pending fatal signals can indeed lead to
> unbounded retries.
Understood.
I will shortly post a trivial patch, for further discussion, that will
hopefully address this particular condition in the compaction retry code path.
Kind regards,
--
Aaron Tomlin