The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help keep the system from getting bogged down in the page reclaim and
compaction code when many THPs are being allocated simultaneously.
However, the gfp_mask for shmem THP allocations was not limited by those
configuration settings, and some workloads ended up with all CPUs stuck
on the LRU lock in the page reclaim code, trying to allocate dozens of
THPs simultaneously.
This patch applies the same configured limits to shmem hugepage
allocations, to prevent that from happening.
This way a THP defrag setting of "never" or "defer+madvise" will result
in quick allocation failures without direct reclaim when no 2MB free
pages are available.
With this patch applied, THP allocations for tmpfs will be a little
more aggressive than today for files mmapped with MADV_HUGEPAGE,
and a little less aggressive for files that are not mmapped or
mapped without that flag.
v6: make khugepaged actually obey tmpfs mount flags
v5: reduce gfp mask further if needed, to accommodate i915 (Matthew Wilcox)
v4: rename alloc_hugepage_direct_gfpmask to vma_thp_gfp_mask (Matthew Wilcox)
v3: fix NULL vma issue spotted by Hugh Dickins & tested
v2: move gfp calculation to shmem_getpage_gfp as suggested by Yu Xu
Matthew Wilcox pointed out that the i915 driver opportunistically
allocates tmpfs memory, but will happily reclaim some of its
pool if no memory is available.
Make sure the gfp mask used to opportunistically allocate a THP
is always at least as restrictive as the original gfp mask.
Signed-off-by: Rik van Riel <[email protected]>
Suggested-by: Matthew Wilcox <[email protected]>
---
mm/shmem.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/mm/shmem.c b/mm/shmem.c
index 6c3cb192a88d..ee3cea10c2a4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1531,6 +1531,26 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
return page;
}
+/*
+ * Make sure huge_gfp is always more limited than limit_gfp.
+ * Some of the flags set permissions, while others set limitations.
+ */
+static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
+{
+ gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+ gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
+ gfp_t result = huge_gfp & ~allowflags;
+
+ /*
+ * Minimize the result gfp by taking the union with the deny flags,
+ * and the intersection of the allow flags.
+ */
+ result |= (limit_gfp & denyflags);
+ result |= (huge_gfp & limit_gfp) & allowflags;
+
+ return result;
+}
+
static struct page *shmem_alloc_hugepage(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
@@ -1889,6 +1909,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
alloc_huge:
huge_gfp = vma_thp_gfp_mask(vma);
+ huge_gfp = limit_gfp_mask(huge_gfp, gfp);
page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
alloc_nohuge:
--
2.25.4
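
A quick way to sanity-check the mask arithmetic in limit_gfp_mask() above,
outside the kernel, is a userspace sketch like the one below. The flag bit
values are stand-ins rather than the kernel's real ones, and the two input
masks are simplified compositions chosen only for illustration.

#include <stdio.h>

/* Stand-in bit values for illustration; the kernel's differ. */
typedef unsigned int gfp_t;
#define __GFP_IO             0x01u
#define __GFP_FS             0x02u
#define __GFP_DIRECT_RECLAIM 0x04u
#define __GFP_KSWAPD_RECLAIM 0x08u
#define __GFP_RECLAIM  (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)
#define __GFP_NOWARN         0x10u
#define __GFP_NORETRY        0x20u

/* Same logic as the patch: keep huge_gfp no less restrictive
 * than limit_gfp. */
static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
	gfp_t result = huge_gfp & ~allowflags;

	result |= (limit_gfp & denyflags);
	result |= (huge_gfp & limit_gfp) & allowflags;

	return result;
}

int main(void)
{
	/* Simplified i915-style caller mask: fail fast, no warning. */
	gfp_t limit = __GFP_IO | __GFP_FS | __GFP_KSWAPD_RECLAIM |
		      __GFP_NORETRY | __GFP_NOWARN;
	/* Simplified THP mask from a "defrag=always" style setting. */
	gfp_t huge = __GFP_IO | __GFP_FS | __GFP_RECLAIM;

	/* Prints 0x3b here: __GFP_DIRECT_RECLAIM is dropped (absent
	 * from limit), while NORETRY and NOWARN are inherited, so the
	 * result is no less restrictive than the caller's mask. */
	printf("result = %#x\n", limit_gfp_mask(huge, limit));
	return 0;
}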
Currently if thp enabled=[madvise], mounting a tmpfs filesystem
with huge=always and mmapping files from that tmpfs does not
result in khugepaged collapsing those mappings, despite the
mount flag indicating that it should.
Fix that by breaking up the blocks of tests in hugepage_vma_check
a little bit, and testing things in the correct order.
Signed-off-by: Rik van Riel <[email protected]>
Fixes: c2231020ea7b ("mm: thp: register mm for khugepaged when merging vma for shmem")
---
include/linux/khugepaged.h | 2 ++
mm/khugepaged.c | 22 ++++++++++++++++------
2 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index c941b7377321..2fcc01891b47 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -3,6 +3,7 @@
#define _LINUX_KHUGEPAGED_H
#include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
+#include <linux/shmem_fs.h>
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -57,6 +58,7 @@ static inline int khugepaged_enter(struct vm_area_struct *vma,
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
if ((khugepaged_always() ||
+ (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) ||
(khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
!(vm_flags & VM_NOHUGEPAGE) &&
!test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4e3dff13eb70..abab394c4206 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -440,18 +440,28 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
static bool hugepage_vma_check(struct vm_area_struct *vma,
unsigned long vm_flags)
{
- if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vm_flags & VM_NOHUGEPAGE) ||
+ /* Explicitly disabled through madvise. */
+ if ((vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
- if (shmem_file(vma->vm_file) ||
- (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- vma->vm_file &&
- (vm_flags & VM_DENYWRITE))) {
+ /* Enabled via shmem mount options or sysfs settings. */
+ if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
HPAGE_PMD_NR);
}
+
+ /* THP settings require madvise. */
+ if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+ return false;
+
+ /* Read-only file mappings need to be aligned for THP to work. */
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+ (vm_flags & VM_DENYWRITE)) {
+ return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ HPAGE_PMD_NR);
+ }
+
if (!vma->anon_vma || vma->vm_ops)
return false;
if (vma_is_temporary_stack(vma))
--
2.25.4
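
As an aside on the IS_ALIGNED() test that both file branches above return:
a mapping can only be collapsed if its start address and its file offset
are congruent modulo the huge page size, since otherwise no offset within
the VMA lines up with a whole PMD. A tiny userspace sketch of that check,
hardcoding 4kB base pages and 2MB huge pages (true on x86-64, an
assumption elsewhere):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT   12                 /* assumes 4kB base pages */
#define HPAGE_PMD_NR 512                /* 2MB / 4kB */
#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

/* Mirrors the test in hugepage_vma_check(): the start pfn minus the
 * file offset (in pages) must be a multiple of HPAGE_PMD_NR. */
static bool pmd_alignable(unsigned long vm_start, unsigned long vm_pgoff)
{
	return IS_ALIGNED((vm_start >> PAGE_SHIFT) - vm_pgoff, HPAGE_PMD_NR);
}

int main(void)
{
	/* 2MB-aligned mapping of file offset 0: collapsible. */
	printf("%d\n", pmd_alignable(0x200000UL, 0));
	/* Same address, but mapping from file page 1: not collapsible. */
	printf("%d\n", pmd_alignable(0x200000UL, 1));
	return 0;
}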
The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help keep the system from getting bogged down in the page reclaim and
compaction code when many THPs are being allocated simultaneously.
However, the gfp_mask for shmem THP allocations was not limited by those
configuration settings, and some workloads ended up with all CPUs stuck
on the LRU lock in the page reclaim code, trying to allocate dozens of
THPs simultaneously.
This patch applies the same configured limits to shmem hugepage
allocations, to prevent that from happening.
Controlling the gfp_mask of THP allocations through the knobs in
sysfs allows users to determine the balance between how aggressively
the system tries to allocate THPs at fault time, and how much the
application may end up stalling attempting those allocations.
This way a THP defrag setting of "never" or "defer+madvise" will result
in quick allocation failures without direct reclaim when no 2MB free
pages are available.
With this patch applied, THP allocations for tmpfs will be a little
more aggressive than today for files mmapped with MADV_HUGEPAGE,
and a little less aggressive for files that are not mmapped or
mapped without that flag.
Signed-off-by: Rik van Riel <[email protected]>
---
include/linux/gfp.h | 2 ++
mm/huge_memory.c | 6 +++---
mm/shmem.c | 8 +++++---
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c603237e006c..c7615c9ba03c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -614,6 +614,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
extern void pm_restrict_gfp_mask(void);
extern void pm_restore_gfp_mask(void);
+extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
+
#ifdef CONFIG_PM_SLEEP
extern bool pm_suspended_storage(void);
#else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9474dbc150ed..c5d03b2f2f2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -649,9 +649,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
* available
* never: never stall for any thp allocation
*/
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
- const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+ const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
/* Always do synchronous compaction */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
@@ -744,7 +744,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
pte_free(vma->vm_mm, pgtable);
return ret;
}
- gfp = alloc_hugepage_direct_gfpmask(vma);
+ gfp = vma_thp_gfp_mask(vma);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137698f8..6c3cb192a88d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1545,8 +1545,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
shmem_pseudo_vma_init(&pvma, info, hindex);
- page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
- HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+ page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
+ true);
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@@ -1802,6 +1802,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page *page;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
+ gfp_t huge_gfp;
int error;
int once = 0;
int alloced = 0;
@@ -1887,7 +1888,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
}
alloc_huge:
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+ huge_gfp = vma_thp_gfp_mask(vma);
+ page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
alloc_nohuge:
page = shmem_alloc_and_acct_page(gfp, inode,
--
2.25.4
On Tue 24-11-20 14:49:24, Rik van Riel wrote:
> Matthew Wilcox pointed out that the i915 driver opportunistically
> allocates tmpfs memory, but will happily reclaim some of its
> pool if no memory is available.
>
> Make sure the gfp mask used to opportunistically allocate a THP
> is always at least as restrictive as the original gfp mask.
I have brought this up in the previous version review and I feel my
feedback hasn't been addressed. Please describe the expected behavior by
those shmem users including GFP_KERNEL restriction which would make the
THP flags incompatible. Is this a problem? Is there any actual problem
if the THP uses its own set of flags?
I am also not happy how those two sets of flags are completely detached
and we can only expect surprises there.
> Signed-off-by: Rik van Riel <[email protected]>
> Suggested-by: Matthew Wilcox <[email protected]>
> ---
> mm/shmem.c | 21 +++++++++++++++++++++
> 1 file changed, 21 insertions(+)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 6c3cb192a88d..ee3cea10c2a4 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1531,6 +1531,26 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
> return page;
> }
>
> +/*
> + * Make sure huge_gfp is always more limited than limit_gfp.
> + * Some of the flags set permissions, while others set limitations.
> + */
> +static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
> +{
> + gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
> + gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
> + gfp_t result = huge_gfp & ~allowflags;
> +
> + /*
> + * Minimize the result gfp by taking the union with the deny flags,
> + * and the intersection of the allow flags.
> + */
> + result |= (limit_gfp & denyflags);
> + result |= (huge_gfp & limit_gfp) & allowflags;
> +
> + return result;
> +}
> +
> static struct page *shmem_alloc_hugepage(gfp_t gfp,
> struct shmem_inode_info *info, pgoff_t index)
> {
> @@ -1889,6 +1909,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>
> alloc_huge:
> huge_gfp = vma_thp_gfp_mask(vma);
> + huge_gfp = limit_gfp_mask(huge_gfp, gfp);
> page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
> if (IS_ERR(page)) {
> alloc_nohuge:
> --
> 2.25.4
>
--
Michal Hocko
SUSE Labs
On 11/24/20 8:49 PM, Rik van Riel wrote:
> The allocation flags of anonymous transparent huge pages can be controlled
> through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
> help the system from getting bogged down in the page reclaim and compaction
> code when many THPs are getting allocated simultaneously.
>
> However, the gfp_mask for shmem THP allocations were not limited by those
> configuration settings, and some workloads ended up with all CPUs stuck
> on the LRU lock in the page reclaim code, trying to allocate dozens of
> THPs simultaneously.
>
> This patch applies the same configurated limitation of THPs to shmem
> hugepage allocations, to prevent that from happening.
>
> Controlling the gfp_mask of THP allocations through the knobs in
> sysfs allows users to determine the balance between how aggressively
> the system tries to allocate THPs at fault time, and how much the
> application may end up stalling attempting those allocations.
>
> This way a THP defrag setting of "never" or "defer+madvise" will result
> in quick allocation failures without direct reclaim when no 2MB free
> pages are available.
>
> With this patch applied, THP allocations for tmpfs will be a little
> more aggressive than today for files mmapped with MADV_HUGEPAGE,
> and a little less aggressive for files that are not mmapped or
> mapped without that flag.
>
> Signed-off-by: Rik van Riel <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
On 11/24/20 8:49 PM, Rik van Riel wrote:
> Currently if thp enabled=[madvise], mounting a tmpfs filesystem
> with huge=always and mmapping files from that tmpfs does not
> result in khugepaged collapsing those mappings, despite the
> mount flag indicating that it should.
>
> Fix that by breaking up the blocks of tests in hugepage_vma_check
> a little bit, and testing things in the correct order.
>
> Signed-off-by: Rik van Riel <[email protected]>
> Fixes: c2231020ea7b ("mm: thp: register mm for khugepaged when merging vma for shmem")
Looks ok. But, if we have sysfs thp enabled=never, and shmem mount explicitly
thp enabled, then shmem mount overrides the global sysfs setting and THPs will
be allocated there, right? However, khugepaged_enabled() will be false and thus
khugepaged won't run at all? So a similar situation to what you're fixing here.
> ---
> include/linux/khugepaged.h | 2 ++
> mm/khugepaged.c | 22 ++++++++++++++++------
> 2 files changed, 18 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index c941b7377321..2fcc01891b47 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -3,6 +3,7 @@
> #define _LINUX_KHUGEPAGED_H
>
> #include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
> +#include <linux/shmem_fs.h>
>
>
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> @@ -57,6 +58,7 @@ static inline int khugepaged_enter(struct vm_area_struct *vma,
> {
> if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
> if ((khugepaged_always() ||
> + (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) ||
> (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
> !(vm_flags & VM_NOHUGEPAGE) &&
> !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 4e3dff13eb70..abab394c4206 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -440,18 +440,28 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
> static bool hugepage_vma_check(struct vm_area_struct *vma,
> unsigned long vm_flags)
> {
> - if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
> - (vm_flags & VM_NOHUGEPAGE) ||
> + /* Explicitly disabled through madvise. */
> + if ((vm_flags & VM_NOHUGEPAGE) ||
> test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
> return false;
>
> - if (shmem_file(vma->vm_file) ||
> - (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
> - vma->vm_file &&
> - (vm_flags & VM_DENYWRITE))) {
> + /* Enabled via shmem mount options or sysfs settings. */
> + if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
> return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
> HPAGE_PMD_NR);
> }
> +
> + /* THP settings require madvise. */
> + if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
> + return false;
> +
> + /* Read-only file mappings need to be aligned for THP to work. */
> + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
> + (vm_flags & VM_DENYWRITE)) {
> + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
> + HPAGE_PMD_NR);
> + }
> +
> if (!vma->anon_vma || vma->vm_ops)
> return false;
> if (vma_is_temporary_stack(vma))
>
On Thu, 2020-11-26 at 14:40 +0100, Michal Hocko wrote:
> On Tue 24-11-20 14:49:24, Rik van Riel wrote:
> > Matthew Wilcox pointed out that the i915 driver opportunistically
> > allocates tmpfs memory, but will happily reclaim some of its
> > pool if no memory is available.
> >
> > Make sure the gfp mask used to opportunistically allocate a THP
> > is always at least as restrictive as the original gfp mask.
>
> I have brought this up in the previous version review and I feel my
> feedback hasn't been addressed. Please describe the expected behavior by
> those shmem users including GFP_KERNEL restriction which would make the
> THP flags incompatible. Is this a problem? Is there any actual problem
> if the THP uses its own set of flags?
In the case of i915, the gfp flags passed in by the i915
driver expect the VM to easily fail the allocation, in
which case the i915 driver will reclaim some existing
buffers and try again.
Trying harder than the original gfp_mask would change
the OOM behavior of systems using the i915 driver.
> I am also not happy how those two sets of flags are completely detached
> and we can only expect surprises there.
I would be more than happy to implement things differently,
but I am not sure what alternative you are suggesting.
--
All Rights Reversed.
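
For readers unfamiliar with the i915 pattern being discussed: the contract
is "allocate with a mask that fails easily; on failure, shrink the driver's
own buffer pool and retry". A minimal userspace analogue of that pattern
follows; all names and numbers here are hypothetical, modeling the behavior
rather than the driver's actual code.

#include <stdbool.h>
#include <stdio.h>

static int free_pages = 0;      /* system is under memory pressure */
static int driver_pool = 64;    /* pages the driver could give back */

/* Models an allocation with __GFP_NORETRY | __GFP_NOWARN: it fails
 * quickly rather than triggering heavy reclaim or the OOM killer. */
static bool alloc_noretry(void)
{
	if (free_pages > 0) {
		free_pages--;
		return true;
	}
	return false;
}

/* Models the i915-style fallback: on failure, reclaim a page from
 * the driver's own pool, then try the allocation once more. */
static bool driver_get_page(void)
{
	if (alloc_noretry())
		return true;
	if (driver_pool > 0) {
		driver_pool--;
		free_pages++;
	}
	return alloc_noretry();
}

int main(void)
{
	printf("got page: %d, pool left: %d\n",
	       driver_get_page(), driver_pool);
	return 0;
}

If the kernel instead tried hard (direct reclaim, retries) before failing,
the easy-failure path above would never be taken, which is the change in
OOM behavior Rik refers to.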
On Thu, 2020-11-26 at 18:18 +0100, Vlastimil Babka wrote:
> On 11/24/20 8:49 PM, Rik van Riel wrote:
> > Currently if thp enabled=[madvise], mounting a tmpfs filesystem
> > with huge=always and mmapping files from that tmpfs does not
> > result in khugepaged collapsing those mappings, despite the
> > mount flag indicating that it should.
> >
> > Fix that by breaking up the blocks of tests in hugepage_vma_check
> > a little bit, and testing things in the correct order.
> >
> > Signed-off-by: Rik van Riel <[email protected]>
> > Fixes: c2231020ea7b ("mm: thp: register mm for khugepaged when
> > merging vma for shmem")
>
> Looks ok. But, it we have sysfs thp enabled=never, and shmem mount explicitly
> thp enabled, then shmem mount overrides the global sysfs setting and thp's will
> be allocated there, right? However, khugepaged_enabled() will be false and thus
> khugepaged won't run at all? So a similar situation than what you're fixing here.
Indeed, that is somewhat similar. Whether or not shmem
allocations attempt huge pages is controlled by both
the file /sys/kernel/mm/transparent_hugepage/shmem_enabled
and mount options.
This patch makes khugepaged treat the mount options and/or
sysfs flag as enabling collapsing of huge pages in case
enabled = [always] for regular THPs.
Should I send another patch on top of this that causes
khugepaged to be enabled when regular THPs are disabled,
but shmem THPs are enabled in any way?
--
All Rights Reversed.
On 11/26/20 7:14 PM, Rik van Riel wrote:
> On Thu, 2020-11-26 at 18:18 +0100, Vlastimil Babka wrote:
>> On 11/24/20 8:49 PM, Rik van Riel wrote:
>>> Currently if thp enabled=[madvise], mounting a tmpfs filesystem
>>> with huge=always and mmapping files from that tmpfs does not
>>> result in khugepaged collapsing those mappings, despite the
>>> mount flag indicating that it should.
>>>
>>> Fix that by breaking up the blocks of tests in hugepage_vma_check
>>> a little bit, and testing things in the correct order.
>>>
>>> Signed-off-by: Rik van Riel <[email protected]>
>>> Fixes: c2231020ea7b ("mm: thp: register mm for khugepaged when
>>> merging vma for shmem")
>>
>> Looks ok. But, it we have sysfs thp enabled=never, and shmem mount explicitly
>> thp enabled, then shmem mount overrides the global sysfs setting and thp's will
>> be allocated there, right? However, khugepaged_enabled() will be false and thus
>> khugepaged won't run at all? So a similar situation than what you're fixing here.
>
> Indeed, that is somewhat similar. Whether or not shmem
> allocations attempt huge pages is controlled by both
> the file /sys/kernel/mm/transparent_hugepage/shmem_enabled
Ah right, there's also that sysfs file.
> and mount options.
>
> This patch makes khugepaged treat the mount options and/or
> sysfs flag as enabling collapsing of huge pages in case
> enabled = [always] for regular THPs.
>
> Should I send another patch on top of this that causes
> khugepaged to be enabled when regular THPs are disabled,
> but shmem THPs are enabled in any way?
I think it would make sense. Although it might involve counting
thp-enabled shmem mounts and only running khugepaged when there are >0 of them.
On Thu 26-11-20 13:04:14, Rik van Riel wrote:
> On Thu, 2020-11-26 at 14:40 +0100, Michal Hocko wrote:
> > On Tue 24-11-20 14:49:24, Rik van Riel wrote:
> > > Matthew Wilcox pointed out that the i915 driver opportunistically
> > > allocates tmpfs memory, but will happily reclaim some of its
> > > pool if no memory is available.
> > >
> > > Make sure the gfp mask used to opportunistically allocate a THP
> > > is always at least as restrictive as the original gfp mask.
> >
> > I have brought this up in the previous version review and I feel my
> > feedback hasn't been addressed. Please describe the expected behavior by
> > those shmem users including GFP_KERNEL restriction which would make the
> > THP flags incompatible. Is this a problem? Is there any actual problem
> > if the THP uses its own set of flags?
>
> In the case of i915, the gfp flags passed in by the i915
> driver expect the VM to easily fail the allocation, in
> which case the i915 driver will reclaim some existing
> buffers and try again.
The existing code tries hard to stay away from the oom killer AFAIU.
At least that is what i915_gem_object_get_pages_gtt says. And that is
ok for order-0 (or low order) requests. But THPs are costly orders and
therefore __GFP_NORETRY has a different meaning. It still controls how
hard to try to compact, but this is not an OOM control. ttm_tt_swapout is
similar, except it has chosen to try harder for order-0 cases but still
wants to prevent the oom killer.
> Trying harder than the original gfp_mask would change the OOM behavior
> of systems using the i915 driver.
>
> > I am also not happy how those two sets of flags are completely detached
> > and we can only expect surprises there.
>
> I would be more than happy to implement things differently,
> but I am not sure what alternative you are suggesting.
Simply do not alter gfp flags? Or warn in some cases of a serious mismatch.
E.g. GFP_ZONEMASK mismatch because there are already GFP_KERNEL users of
shmem.
--
Michal Hocko
SUSE Labs
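
A concrete form the suggested warning could take (purely illustrative,
not code from this thread) would be a one-time check inside
limit_gfp_mask():

	/* Hypothetical: complain when the caller's zone restriction
	 * disagrees with the zones in the THP mask. */
	WARN_ON_ONCE((huge_gfp & GFP_ZONEMASK) !=
		     (limit_gfp & GFP_ZONEMASK));

Later in the thread the gma500/DMA32 case turns out to be exactly such a
mismatch, addressed there by masking rather than warning.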
On Tue 24-11-20 14:49:23, Rik van Riel wrote:
> The allocation flags of anonymous transparent huge pages can be controlled
> through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
> help the system from getting bogged down in the page reclaim and compaction
> code when many THPs are getting allocated simultaneously.
>
> However, the gfp_mask for shmem THP allocations were not limited by those
> configuration settings, and some workloads ended up with all CPUs stuck
> on the LRU lock in the page reclaim code, trying to allocate dozens of
> THPs simultaneously.
>
> This patch applies the same configurated limitation of THPs to shmem
> hugepage allocations, to prevent that from happening.
>
> Controlling the gfp_mask of THP allocations through the knobs in
> sysfs allows users to determine the balance between how aggressively
> the system tries to allocate THPs at fault time, and how much the
> application may end up stalling attempting those allocations.
>
> This way a THP defrag setting of "never" or "defer+madvise" will result
> in quick allocation failures without direct reclaim when no 2MB free
> pages are available.
>
> With this patch applied, THP allocations for tmpfs will be a little
> more aggressive than today for files mmapped with MADV_HUGEPAGE,
> and a little less aggressive for files that are not mmapped or
> mapped without that flag.
As already said I am not against this unification. On the other I really
hope that we will not hear about somebody really requesting a per mount
control over this behavior because some might be benefiting from THPs
more than others and the initial cost would pay off. This is not
something we do care about now so this patch wouldn't regress in that
aspect.
> Signed-off-by: Rik van Riel <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Btw. Documentation/admin-guide/mm/transhuge.rst needs some update as
well. What about the following?
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index b2acd0d395ca..41fe84c5b808 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -104,7 +104,7 @@ regions (to avoid the risk of consuming more memory resources) or enabled
echo never >/sys/kernel/mm/transparent_hugepage/enabled
It's also possible to limit defrag efforts in the VM to generate
-anonymous hugepages in case they're not immediately free to madvise
+anonymous and shmem hugepages in case they're not immediately free to madvise
regions or to never try to defrag memory and simply fallback to regular
pages unless hugepages are immediately available. Clearly if we spend CPU
time to defrag memory, we would expect to gain even more by the fact we
> ---
> include/linux/gfp.h | 2 ++
> mm/huge_memory.c | 6 +++---
> mm/shmem.c | 8 +++++---
> 3 files changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index c603237e006c..c7615c9ba03c 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -614,6 +614,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
> extern void pm_restrict_gfp_mask(void);
> extern void pm_restore_gfp_mask(void);
>
> +extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
> +
> #ifdef CONFIG_PM_SLEEP
> extern bool pm_suspended_storage(void);
> #else
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9474dbc150ed..c5d03b2f2f2f 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -649,9 +649,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> * available
> * never: never stall for any thp allocation
> */
> -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
> +gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
> {
> - const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
> + const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
>
> /* Always do synchronous compaction */
> if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
> @@ -744,7 +744,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> pte_free(vma->vm_mm, pgtable);
> return ret;
> }
> - gfp = alloc_hugepage_direct_gfpmask(vma);
> + gfp = vma_thp_gfp_mask(vma);
> page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
> if (unlikely(!page)) {
> count_vm_event(THP_FAULT_FALLBACK);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 537c137698f8..6c3cb192a88d 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1545,8 +1545,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
> return NULL;
>
> shmem_pseudo_vma_init(&pvma, info, hindex);
> - page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
> - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
> + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
> + true);
> shmem_pseudo_vma_destroy(&pvma);
> if (page)
> prep_transhuge_page(page);
> @@ -1802,6 +1802,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> struct page *page;
> enum sgp_type sgp_huge = sgp;
> pgoff_t hindex = index;
> + gfp_t huge_gfp;
> int error;
> int once = 0;
> int alloced = 0;
> @@ -1887,7 +1888,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> }
>
> alloc_huge:
> - page = shmem_alloc_and_acct_page(gfp, inode, index, true);
> + huge_gfp = vma_thp_gfp_mask(vma);
> + page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
> if (IS_ERR(page)) {
> alloc_nohuge:
> page = shmem_alloc_and_acct_page(gfp, inode,
> --
> 2.25.4
--
Michal Hocko
SUSE Labs
On Thu, 2020-11-26 at 20:42 +0100, Vlastimil Babka wrote:
> On 11/26/20 7:14 PM, Rik van Riel wrote:
> > On Thu, 2020-11-26 at 18:18 +0100, Vlastimil Babka wrote:
> > >
> > This patch makes khugepaged treat the mount options and/or
> > sysfs flag as enabling collapsing of huge pages in case
> > enabled = [always] for regular THPs.
> >
> > Should I send another patch on top of this that causes
> > khugepaged to be enabled when regular THPs are disabled,
> > but shmem THPs are enabled in any way?
>
> I think it would make sense. Although it might involve counting
> thp-enabled shmem mounts and only run khugepaged when there are >0 of
> them.
That seems feasible. I can do that as a follow-up 4/3
patch after the Thanksgiving weekend :)
--
All Rights Reversed.
On Fri, 2020-11-27 at 08:52 +0100, Michal Hocko wrote:
> On Thu 26-11-20 13:04:14, Rik van Riel wrote:
> >
> > I would be more than happy to implement things differently,
> > but I am not sure what alternative you are suggesting.
>
> Simply do not alter gfp flags? Or warn in some cases of a serious mismatch.
> E.g. GFP_ZONEMASK mismatch because there are already GFP_KERNEL users of
> shmem.
Not altering the gfp flags is not really an option,
because that would lead to attempting to allocate THPs
with GFP_HIGHUSER, which is what is used to allocate
regular tmpfs pages.
If the THP configuration in sysfs says we should not
be doing compaction/reclaim from THP allocations, we
should obey that configuration setting, and use
gfp_flags that result in no compaction/reclaim being done.
--
All Rights Reversed.
On Fri 27-11-20 14:03:39, Rik van Riel wrote:
> On Fri, 2020-11-27 at 08:52 +0100, Michal Hocko wrote:
> > On Thu 26-11-20 13:04:14, Rik van Riel wrote:
> > >
> > > I would be more than happy to implement things differently,
> > > but I am not sure what alternative you are suggesting.
> >
> > Simply do not alter gfp flags? Or warn in some cases of a serious mismatch.
> > E.g. GFP_ZONEMASK mismatch because there are already GFP_KERNEL users of
> > shmem.
>
> Not altering the gfp flags is not really an option,
> because that would leads to attempting to allocate THPs
> with GFP_HIGHUSER, which is what is used to allocate
> regular tmpfs pages.
Right but that is a completely different reason to alter the mask and it
would be really great to know whether this is a theoretical concern or
those users simply do not ever use THPs. Btw. should they be using THPs
even if they opt themselves into GFP_KERNEL restriction?
> If the THP configuration in sysfs says we should not
> be doing compaction/reclaim from THP allocations, we
> should obey that configuration setting, and use a
> gfp_flags that results in no compaction/reclaim being done.
Yes, I agree with that. The thing I disagree with is that you try to mix
in how hard to try from the shmem users, which are not really THP aware
and merely want to control how hard to try for order-0 pages. Or, more
precisely, whether to invoke the OOM killer before doing their fallback.
So your patch adds a very subtle behavior that would be really hard to
maintain long term, because how hard to compact is completely
detached from the users who supply the gfp mask.
--
Michal Hocko
SUSE Labs
On Mon, 2020-11-30 at 11:00 +0100, Michal Hocko wrote:
> On Fri 27-11-20 14:03:39, Rik van Riel wrote:
> > On Fri, 2020-11-27 at 08:52 +0100, Michal Hocko wrote:
> > > On Thu 26-11-20 13:04:14, Rik van Riel wrote:
> > > > I would be more than happy to implement things differently,
> > > > but I am not sure what alternative you are suggesting.
> > >
> > > Simply do not alter gfp flags? Or warn in some cases of a serious mismatch.
> > > E.g. GFP_ZONEMASK mismatch because there are already GFP_KERNEL users of
> > > shmem.
> >
> > Not altering the gfp flags is not really an option,
> > because that would leads to attempting to allocate THPs
> > with GFP_HIGHUSER, which is what is used to allocate
> > regular tmpfs pages.
>
> Right but that is a completely different reason to alter the mask and it
> would be really great to know whether this is a theoretical concern or
> those users simply do not ever use THPs. Btw. should they be using THPs
> even if they opt themselves into GFP_KERNEL restriction?
I suppose disabling THPs completely if the gfp_mask
passed to shmem_getpage_gfp() is not GFP_HIGHUSER
is another option.
That seems like it might come with its own pitfalls,
though, both functionality wise and maintenance wise.
Does anyone have strong feelings between "limit gfp_mask"
and "disable THP for !GFP_HIGHUSER"?
--
All Rights Reversed.
On Tue, 24 Nov 2020, Rik van Riel wrote:
> The allocation flags of anonymous transparent huge pages can be controlled
> through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
> help the system from getting bogged down in the page reclaim and compaction
> code when many THPs are getting allocated simultaneously.
>
> However, the gfp_mask for shmem THP allocations were not limited by those
> configuration settings, and some workloads ended up with all CPUs stuck
> on the LRU lock in the page reclaim code, trying to allocate dozens of
> THPs simultaneously.
>
> This patch applies the same configurated limitation of THPs to shmem
> hugepage allocations, to prevent that from happening.
>
> This way a THP defrag setting of "never" or "defer+madvise" will result
> in quick allocation failures without direct reclaim when no 2MB free
> pages are available.
>
> With this patch applied, THP allocations for tmpfs will be a little
> more aggressive than today for files mmapped with MADV_HUGEPAGE,
> and a little less aggressive for files that are not mmapped or
> mapped without that flag.
>
> v6: make khugepaged actually obey tmpfs mount flags
> v5: reduce gfp mask further if needed, to accomodate i915 (Matthew Wilcox)
> v4: rename alloc_hugepage_direct_gfpmask to vma_thp_gfp_mask (Matthew Wilcox)
> v3: fix NULL vma issue spotted by Hugh Dickins & tested
> v2: move gfp calculation to shmem_getpage_gfp as suggested by Yu Xu
Andrew, please don't rush
mmthpshmem-limit-shmem-thp-alloc-gfp_mask.patch
mmthpshm-limit-gfp-mask-to-no-more-than-specified.patch
mmthpshmem-make-khugepaged-obey-tmpfs-mount-flags.patch
to Linus in your first wave of mmotm->5.11 sendings.
Or, alternatively, go ahead and send them to Linus, but
be aware that I'm fairly likely to want adjustments later.
Sorry for limping along so far behind, but I still have more
re-reading of the threads to do, and I'm still investigating
why tmpfs huge=always becomes so ineffective in my testing with
these changes, even if I ramp up from default defrag=madvise to
defrag=always:
                     5.10   mmotm
thp_file_alloc    4641788  216027
thp_file_fallback  275339 8895647
I've been looking into it off and on for weeks (gfp_mask wrangling is
not my favourite task! so tend to find higher priorities to divert me);
hoped to arrive at a conclusion before merge window, but still have
nothing constructive to say yet, hence my silence so far.
Above's "a little less aggressive" appears an understatement at present.
I respect what Rik is trying to achieve here, and I may end up
concluding that there's nothing better to be done than what he has.
My kind of hugepage-thrashing-in-low-memory may be so remote from
normal usage, and could be skirting the latency horrors we all want
to avoid: but I haven't reached that conclusion yet - the disparity
in effectiveness still deserves more investigation.
(There's also a specific issue with the gfp_mask limiting: I have
not yet reviewed the allowing and denying in detail, but it looks
like it does not respect the caller's GFP_ZONEMASK - the gfp in
shmem_getpage_gfp() and shmem_read_mapping_page_gfp() is there to
satisfy the gma500, which wanted to use shmem but could only manage
DMA32. I doubt it wants THPs, but shmem_enabled=force forces them.)
Thanks,
Hugh
On Mon, 14 Dec 2020 13:16:39 -0800 (PST) Hugh Dickins <[email protected]> wrote:
> Andrew, please don't rush
>
> mmthpshmem-limit-shmem-thp-alloc-gfp_mask.patch
> mmthpshm-limit-gfp-mask-to-no-more-than-specified.patch
> mmthpshmem-make-khugepaged-obey-tmpfs-mount-flags.patch
>
> to Linus in your first wave of mmotm->5.11 sendings.
No probs - I'll park them until all this is sorted out. Thanks for
letting us know, and for your diligence ;)
On Mon, 14 Dec 2020, Vlastimil Babka wrote:
> On 12/14/20 10:16 PM, Hugh Dickins wrote:
> > On Tue, 24 Nov 2020, Rik van Riel wrote:
> >
> >> The allocation flags of anonymous transparent huge pages can be controlled
> >> through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
> >> help the system from getting bogged down in the page reclaim and compaction
> >> code when many THPs are getting allocated simultaneously.
> >>
> >> However, the gfp_mask for shmem THP allocations were not limited by those
> >> configuration settings, and some workloads ended up with all CPUs stuck
> >> on the LRU lock in the page reclaim code, trying to allocate dozens of
> >> THPs simultaneously.
> >>
> >> This patch applies the same configurated limitation of THPs to shmem
> >> hugepage allocations, to prevent that from happening.
> >>
> >> This way a THP defrag setting of "never" or "defer+madvise" will result
> >> in quick allocation failures without direct reclaim when no 2MB free
> >> pages are available.
> >>
> >> With this patch applied, THP allocations for tmpfs will be a little
> >> more aggressive than today for files mmapped with MADV_HUGEPAGE,
> >> and a little less aggressive for files that are not mmapped or
> >> mapped without that flag.
> >>
> >> v6: make khugepaged actually obey tmpfs mount flags
> >> v5: reduce gfp mask further if needed, to accomodate i915 (Matthew Wilcox)
> >> v4: rename alloc_hugepage_direct_gfpmask to vma_thp_gfp_mask (Matthew Wilcox)
> >> v3: fix NULL vma issue spotted by Hugh Dickins & tested
> >> v2: move gfp calculation to shmem_getpage_gfp as suggested by Yu Xu
> >
> > Andrew, please don't rush
> >
> > mmthpshmem-limit-shmem-thp-alloc-gfp_mask.patch
> > mmthpshm-limit-gfp-mask-to-no-more-than-specified.patch
> > mmthpshmem-make-khugepaged-obey-tmpfs-mount-flags.patch
> >
> > to Linus in your first wave of mmotm->5.11 sendings.
> > Or, alternatively, go ahead and send them to Linus, but
> > be aware that I'm fairly likely to want adjustments later.
And I have a suspicion that Andrew might want to send these in
for 5.12.
I spent a lot of time trying to find a compromise that would
satisfy us all, but failed to do so. I kept hoping to find one
next day, so never reached the point of responding.
My fundamental objection to Rik's gfp_mask patches (the third
is different, looks good, though I never studied it properly) is
that (as I said right at the start) anyone who uses a huge=always
mount of tmpfs is already advising for huge pages. The situation is
different from anon, where everything on a machine with THP enabled
is liable to get huge pages, and an madvise necessary to distinguish
who wants. (madvise is also quite the wrong tool for a filesystem.)
But when I tried modifying the patches to reflect that huge=always
already advises for huge, that did not give a satisfactory result
either: precisely what was wrong with every combination I tried,
I do not have to hand - again, I was hoping for a success which
did not arrive.
But if Andrew wants to put these in, I'll no longer object to their
inclusion: it seems wrong to me, to replace one unsatisfactory array
of choices by another unsatisfactory array of choices, but in the end
it's to be decided by what users prefer - if we hear of regressions
(people not getting the huge pages that they have come to expect),
then the patches will have to be reverted.
> >
> > Sorry for limping along so far behind, but I still have more
> > re-reading of the threads to do, and I'm still investigating
> > why tmpfs huge=always becomes so ineffective in my testing with
> > these changes, even if I ramp up from default defrag=madvise to
> > defrag=always:
> > 5.10 mmotm
> > thp_file_alloc 4641788 216027
> > thp_file_fallback 275339 8895647
I never devised a suitable test to prove it, but I did come to
believe that the worrying scale of that regression comes from the
kind of unrealistic testing I'm doing, and would not be nearly so
bad in "real life".
Since I'm interested in exercising the assembly and splitting of
huge pages for testing, I'm happy to run kernel builds of many small
source files in a limited huge=always tmpfs in limited memory. But
for that to work, for the files allocated hugely to fit in, it does
depend on direct reclaim and kswapd to split and shrink the ends of the
older files, so compaction can make huge pages available to the newer.
Whereas most people should be using huge tmpfs more appropriately,
for huge files.
>
> So AFAICS before the patch shmem allocated hugepages basically with:
> mapping_gfp_mask(inode->i_mapping) | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN
> where mapping_gfp_mask() should be the default GFP_HIGHUSER_MOVABLE unless I
> missed some shmem-specific override of the mask.
>
> So the important flags mean all zones available, both __GFP_DIRECT_RECLAIM and
> __GFP_KSWAPD_RECLAIM, but also __GFP_NORETRY which makes it less aggressive.
>
> Now, with defrag=madvise and without madvised vma, there's just
> GFP_TRANSHUGE_LIGHT, which means no __GFP_DIRECT_RECLAIM (and no
> __GFP_KSWAPD_RECLAIM). Thus no reclaim and compaction at all. Indeed "little
> less aggressive" is an understatement.
Yes, I remember agreeing with your analysis.
>
> On the other hand, with defrag=always and again without madvised vma there
> should be GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM | __GFP_NORETRY, so
> compared to "before the patch" this is only missing __GFP_KSWAPD_RECLAIM. I
> would be surprised if this meant so much difference in your testing as you show
> above - I think you would have to be allocating those THPs just at a rate where
> kswapd+kcompactd can keep up and nothing else "steals" the pages that background
> reclaim+compaction creates.
> In that (subjectively unlikely) case, I think significant improvement should be
> visible with defrag=defer over defrag=madvise.
I do have reams of results from trying different things, but I'm not
referring to them now: I think I found that, as you supposed, defrag=defer
did give the closest approximation to what I was seeing before the patches;
but was still inferior, in both THP success rate and overall time taken.
But going that way, we might be suggesting one mode of defrag for shmem,
and a different mode of defrag for anon; whereas Rik's intent was to
unify, without more knobs.
>
> > I've been looking into it off and on for weeks (gfp_mask wrangling is
> > not my favourite task! so tend to find higher priorities to divert me);
> > hoped to arrive at a conclusion before merge window, but still have
> > nothing constructive to say yet, hence my silence so far.
> >
> > Above's "a little less aggressive" appears understatement at present.
> > I respect what Rik is trying to achieve here, and I may end up
> > concluding that there's nothing better to be done than what he has.
> > My kind of hugepage-thrashing-in-low-memory may be so remote from
> > normal usage, and could be skirting the latency horrors we all want
> > to avoid: but I haven't reached that conclusion yet - the disparity
> > in effectiveness still deserves more investigation.
> >
> > (There's also a specific issue with the gfp_mask limiting: I have
> > not yet reviewed the allowing and denying in detail, but it looks
> > like it does not respect the caller's GFP_ZONEMASK - the gfp in
> > shmem_getpage_gfp() and shmem_read_mapping_page_gfp() is there to
> > satisfy the gma500, which wanted to use shmem but could only manage
> > DMA32. I doubt it wants THPS, but shmem_enabled=force forces them.)
Oh, I'd forgotten all about that gma500 aspect:
well, I can send a fixup later on.
> >
> > Thanks,
> > Hugh
On Wed, 2021-02-24 at 00:41 -0800, Hugh Dickins wrote:
> On Mon, 14 Dec 2020, Vlastimil Babka wrote:
>
> > > (There's also a specific issue with the gfp_mask limiting: I have
> > > not yet reviewed the allowing and denying in detail, but it looks
> > > like it does not respect the caller's GFP_ZONEMASK - the gfp in
> > > shmem_getpage_gfp() and shmem_read_mapping_page_gfp() is there to
> > > satisfy the gma500, which wanted to use shmem but could only manage
> > > DMA32. I doubt it wants THPS, but shmem_enabled=force forces them.)
>
> Oh, I'd forgotten all about that gma500 aspect:
> well, I can send a fixup later on.
I already have code to fix that, which somebody earlier
in this discussion convinced me to throw away. Want me
to send it as a patch 4/3 ?
--
All Rights Reversed.
On Wed, 24 Feb 2021, Rik van Riel wrote:
> On Wed, 2021-02-24 at 00:41 -0800, Hugh Dickins wrote:
> > On Mon, 14 Dec 2020, Vlastimil Babka wrote:
> >
> > > > (There's also a specific issue with the gfp_mask limiting: I have
> > > > not yet reviewed the allowing and denying in detail, but it looks
> > > > like it does not respect the caller's GFP_ZONEMASK - the gfp in
> > > > shmem_getpage_gfp() and shmem_read_mapping_page_gfp() is there to
> > > > satisfy the gma500, which wanted to use shmem but could only manage
> > > > DMA32. I doubt it wants THPS, but shmem_enabled=force forces them.)
> >
> > Oh, I'd forgotten all about that gma500 aspect:
> > well, I can send a fixup later on.
>
> I already have code to fix that, which somebody earlier
> in this discussion convinced me to throw away. Want me
> to send it as a patch 4/3 ?
If Andrew wants it all, yes, please do add that - thanks Rik.
Hugh
On Wed, 24 Feb 2021 08:55:40 -0800 (PST)
Hugh Dickins <[email protected]> wrote:
> On Wed, 24 Feb 2021, Rik van Riel wrote:
> > On Wed, 2021-02-24 at 00:41 -0800, Hugh Dickins wrote:
> > > Oh, I'd forgotten all about that gma500 aspect:
> > > well, I can send a fixup later on.
> >
> > I already have code to fix that, which somebody earlier
> > in this discussion convinced me to throw away. Want me
> > to send it as a patch 4/3 ?
>
> If Andrew wants it all, yes, please do add that - thanks Rik.
Trivial patch to fix the gma500 thing below:
---8<---
mm,shmem,thp: limit shmem THP allocations to requested zones
Hugh pointed out that the gma500 driver uses shmem pages, but needs
to limit them to the DMA32 zone. Ensure the allocations resulting from
the gfp_mask returned by limit_gfp_mask use the zone flags that were
originally passed to shmem_getpage_gfp.
Signed-off-by: Rik van Riel <[email protected]>
Suggested-by: Hugh Dickins <[email protected]>
---
mm/shmem.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index ee3cea10c2a4..876fec89686f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1539,7 +1539,11 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
- gfp_t result = huge_gfp & ~allowflags;
+ gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
+ gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
+
+ /* Allow allocations only from the originally specified zones. */
+ result |= zoneflags;
/*
* Minimize the result gfp by taking the union with the deny flags,
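
The effect of this fix can be checked with the same kind of userspace
sketch as before (stand-in bit values again, not the kernel's real ones):
a zone restriction such as __GFP_DMA32 in the caller's mask now survives
into the THP mask, while any zone bits in huge_gfp are discarded.

#include <stdio.h>

typedef unsigned int gfp_t;
#define __GFP_IO        0x01u
#define __GFP_FS        0x02u
#define __GFP_RECLAIM   0x0cu
#define __GFP_NOWARN    0x10u
#define __GFP_NORETRY   0x20u
#define __GFP_DMA32     0x40u
#define GFP_ZONEMASK    0xc0u   /* pretend two zone bits */

/* limit_gfp_mask() with the zone fix above applied. */
static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);

	/* Allow allocations only from the originally specified zones. */
	result |= zoneflags;

	result |= (limit_gfp & denyflags);
	result |= (huge_gfp & limit_gfp) & allowflags;
	return result;
}

int main(void)
{
	/* gma500-style caller: GFP_KERNEL-ish but restricted to DMA32. */
	gfp_t limit = __GFP_IO | __GFP_FS | __GFP_RECLAIM | __GFP_DMA32;
	gfp_t huge  = __GFP_IO | __GFP_FS | __GFP_RECLAIM;

	/* Prints 0x4f here: the DMA32 bit is carried into the result. */
	printf("result = %#x\n", limit_gfp_mask(huge, limit));
	return 0;
}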
On 2/24/21 6:10 PM, Rik van Riel wrote:
> On Wed, 24 Feb 2021 08:55:40 -0800 (PST)
> Hugh Dickins <[email protected]> wrote:
>> On Wed, 24 Feb 2021, Rik van Riel wrote:
>> > On Wed, 2021-02-24 at 00:41 -0800, Hugh Dickins wrote:
>> > > Oh, I'd forgotten all about that gma500 aspect:
>> > > well, I can send a fixup later on.
>> >
>> > I already have code to fix that, which somebody earlier
>> > in this discussion convinced me to throw away. Want me
>> > to send it as a patch 4/3 ?
>>
>> If Andrew wants it all, yes, please do add that - thanks Rik.
>
> Trivial patch to fix the gma500 thing below:
>
> ---8<---
>
> mm,shmem,thp: limit shmem THP allocations to requested zones
>
> Hugh pointed out that the gma500 driver uses shmem pages, but needs
> to limit them to the DMA32 zone. Ensure the allocations resulting from
> the gfp_mask returned by limit_gfp_mask use the zone flags that were
> originally passed to shmem_getpage_gfp.
>
> Signed-off-by: Rik van Riel <[email protected]>
> Suggested-by: Hugh Dickins <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
> ---
> mm/shmem.c | 6 +++++-
> 1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index ee3cea10c2a4..876fec89686f 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1539,7 +1539,11 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
> {
> gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
> gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
> - gfp_t result = huge_gfp & ~allowflags;
> + gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
> + gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
> +
> + /* Allow allocations only from the originally specified zones. */
> + result |= zoneflags;
>
> /*
> * Minimize the result gfp by taking the union with the deny flags,
>